blob: 9826dc56f1e085f395c8ccf55b1a94d227de21e7 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches.  Exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN gets defined; little endian is the default
   whenever WORDS_BIGENDIAN is not defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Object check used inside assert() by the macros below: debug builds
   run _PyUnicode_CheckConsistency() (without the content scan), other
   builds only perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw, unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer of a ready string: for a compact ASCII string this
   is the address immediately following the PyASCIIObject header, otherwise
   it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Raw, unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length of a ready string: the string length for a compact
   ASCII string, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Raw, unchecked field accessors (usable as lvalues). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked read of the kind and length fields. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Raw, unchecked access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of both values, so it is never smaller
   than either of them but may be larger than the true maximum. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))

/* Local replacement of PyUnicode_READY(): evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 pointer of the string is the very same pointer as its
   character data, i.e. no separate UTF-8 buffer is attached.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string is the very same pointer as its
   character data.  Every use refers to the macro parameter `op`, so the
   macro works regardless of how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a dedicated UTF-8 memory block, that is:
   it is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is different from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a dedicated wstr memory block, that is:
   its wstr pointer is set and either the string is not ready yet or the
   pointer is different from the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every element is converted with a plain cast to to_type.  The bulk of
   the input is copied four elements per iteration, the remaining 0-3
   elements one at a time.

   The `to` argument is parenthesized before being cast so that an
   expression argument (e.g. "buf + offset") is evaluated in its own type
   first and only then converted to "to_type *". */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, 1 for whitespace and 0 for anything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0009: CHARACTER TABULATION
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR
       0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
/* Fast detection of line break characters in the ASCII range: a 128-entry
   table indexed by ASCII code, 1 for a line break and 0 for anything else.
   The table is only ever read, so it is declared const like
   _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x000A: LINE FEED
       0x000B: LINE TABULATION
       0x000C: FORM FEED
       0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x001C: FILE SEPARATOR
       0x001D: GROUP SEPARATOR
       0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
#ifdef Py_DEBUG
/* Debug-build self check of a Unicode object.

   Asserts that the state flags (ascii, compact, ready, kind) and the
   length / hash / wstr / utf8 / data fields of `op` form one of the
   allowed combinations.  When `check_content` is non-zero and the string
   is not of the wchar kind, the characters are also scanned to assert that
   the largest one falls in the range expected for the kind and that the
   character stored at index `length` is 0.

   Always returns 1 so that the call itself can be wrapped in assert();
   any violation aborts through one of the inner assertions. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose characters have the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data directly follows the structure */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i = 0; i < ascii->length; i++) {
            ch = PyUnicode_READ(kind, data, i);
            maxchar = (ch > maxchar) ? ch : maxchar;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the slot at index `length` must hold 0 */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
Victor Stinner3a50e702011-10-18 21:21:00 +0200513#ifdef HAVE_MBCS
514static OSVERSIONINFOEX winver;
515#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
519/* stuff to implement simple "bloom filters" for Unicode characters.
520 to keep things simple, we use a single bitmask, using the least 5
521 bits from each unicode characters as the bit index. */
522
523/* the linebreak mask is set up by Unicode_Init below */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535#define BLOOM_MASK unsigned long
536
537static BLOOM_MASK bloom_linebreak;
538
Antoine Pitrouf068f942010-01-13 14:19:12 +0000539#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
540#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542#define BLOOM_LINEBREAK(ch) \
543 ((ch) < 128U ? ascii_linebreak[(ch)] : \
544 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
/* Membership test of character `chr` in string `str`: the bloom `mask` is
   consulted first, and PyUnicode_FindChar() over the whole string is only
   called when the bit of `chr` is set in the mask. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200657 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100659 assert(PyUnicode_IS_COMPACT(unicode));
660
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200661 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100662 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 struct_size = sizeof(PyASCIIObject);
664 else
665 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200666 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
669 PyErr_NoMemory();
670 return NULL;
671 }
672 new_size = (struct_size + (length + 1) * char_size);
673
Victor Stinner84def372011-12-11 20:04:56 +0100674 _Py_DEC_REFTOTAL;
675 _Py_ForgetReference(unicode);
676
677 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
678 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100679 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 PyErr_NoMemory();
681 return NULL;
682 }
Victor Stinner84def372011-12-11 20:04:56 +0100683 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100685
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200687 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100689 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200690 _PyUnicode_WSTR_LENGTH(unicode) = length;
691 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
693 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200694 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return unicode;
696}
697
Alexander Belopolsky40018472011-02-26 01:02:56 +0000698static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200699resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700{
Victor Stinner95663112011-10-04 01:03:50 +0200701 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100702 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000705
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 if (PyUnicode_IS_READY(unicode)) {
707 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200708 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 void *data;
710
711 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200713 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
714 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
716 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
717 PyErr_NoMemory();
718 return -1;
719 }
720 new_size = (length + 1) * char_size;
721
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
723 {
724 PyObject_DEL(_PyUnicode_UTF8(unicode));
725 _PyUnicode_UTF8(unicode) = NULL;
726 _PyUnicode_UTF8_LENGTH(unicode) = 0;
727 }
728
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 data = (PyObject *)PyObject_REALLOC(data, new_size);
730 if (data == NULL) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200737 _PyUnicode_WSTR_LENGTH(unicode) = length;
738 }
739 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200740 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_UTF8_LENGTH(unicode) = length;
742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_LENGTH(unicode) = length;
744 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200745 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200746 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749 }
Victor Stinner95663112011-10-04 01:03:50 +0200750 assert(_PyUnicode_WSTR(unicode) != NULL);
751
752 /* check for integer overflow */
753 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
754 PyErr_NoMemory();
755 return -1;
756 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200758 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100759 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200760 if (!wstr) {
761 PyErr_NoMemory();
762 return -1;
763 }
764 _PyUnicode_WSTR(unicode) = wstr;
765 _PyUnicode_WSTR(unicode)[length] = 0;
766 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200767 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 return 0;
769}
770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771static PyObject*
772resize_copy(PyObject *unicode, Py_ssize_t length)
773{
774 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777
Benjamin Petersonbac79492012-01-14 13:34:47 -0500778 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
782 if (copy == NULL)
783 return NULL;
784
785 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200786 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200788 }
789 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 if (w == NULL)
794 return NULL;
795 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
796 copy_length = Py_MIN(copy_length, length);
797 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
798 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200799 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200800 }
801}
802
/* We allocate one extra character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never reduces its size below 1.

*/
811
#ifdef Py_DEBUG
/* Debug-build counter: incremented on each call to _PyUnicode_New() that
   gets past the empty-string shortcut and the size checks. */
static int unicode_old_new_calls = 0;
#endif
815
Alexander Belopolsky40018472011-02-26 01:02:56 +0000816static PyUnicodeObject *
817_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818{
819 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 if (length == 0 && unicode_empty != NULL) {
824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200825 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000828 /* Ensure we won't overflow the size. */
829 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
830 return (PyUnicodeObject *)PyErr_NoMemory();
831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832 if (length < 0) {
833 PyErr_SetString(PyExc_SystemError,
834 "Negative size passed to _PyUnicode_New");
835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 }
837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838#ifdef Py_DEBUG
839 ++unicode_old_new_calls;
840#endif
841
842 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
843 if (unicode == NULL)
844 return NULL;
845 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
846 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
847 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000849 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100850 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852
Jeremy Hyltond8082792003-09-16 19:41:39 +0000853 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000854 * the caller fails before initializing str -- unicode_resize()
855 * reads str[0], and the Keep-Alive optimization can keep memory
856 * allocated for str alive across a call to unicode_dealloc(unicode).
857 * We don't want unicode_resize to read uninitialized memory in
858 * that case.
859 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 _PyUnicode_WSTR(unicode)[0] = 0;
861 _PyUnicode_WSTR(unicode)[length] = 0;
862 _PyUnicode_WSTR_LENGTH(unicode) = length;
863 _PyUnicode_HASH(unicode) = -1;
864 _PyUnicode_STATE(unicode).interned = 0;
865 _PyUnicode_STATE(unicode).kind = 0;
866 _PyUnicode_STATE(unicode).compact = 0;
867 _PyUnicode_STATE(unicode).ready = 0;
868 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200869 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200871 _PyUnicode_UTF8(unicode) = NULL;
872 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100873 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874 return unicode;
875}
876
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877static const char*
878unicode_kind_name(PyObject *unicode)
879{
Victor Stinner42dfd712011-10-03 14:41:45 +0200880 /* don't check consistency: unicode_kind_name() is called from
881 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 if (!PyUnicode_IS_COMPACT(unicode))
883 {
884 if (!PyUnicode_IS_READY(unicode))
885 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600886 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 {
888 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 return "legacy ascii";
891 else
892 return "legacy latin1";
893 case PyUnicode_2BYTE_KIND:
894 return "legacy UCS2";
895 case PyUnicode_4BYTE_KIND:
896 return "legacy UCS4";
897 default:
898 return "<legacy invalid kind>";
899 }
900 }
901 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600902 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 return "ascii";
906 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 default:
913 return "<invalid compact kind>";
914 }
915}
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917#ifdef Py_DEBUG
/* Debug-build counter: incremented on each call to PyUnicode_New() that
   gets past the empty-string shortcut. */
static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919
/* Functions wrapping macros for use in debugger */

/* Return the result of the PyUnicode_UTF8() macro for `unicode`, as a real
   function that can be called from a debugger. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}
924
/* Return the result of the _PyUnicode_COMPACT_DATA() macro for `unicode`,
   as a real function that can be called from a debugger. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
928void *_PyUnicode_data(void *unicode){
929 printf("obj %p\n", unicode);
930 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
931 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
932 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
933 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
934 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
935 return PyUnicode_DATA(unicode);
936}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200937
938void
939_PyUnicode_Dump(PyObject *op)
940{
941 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
943 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
944 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200947 {
948 if (ascii->state.ascii)
949 data = (ascii + 1);
950 else
951 data = (compact + 1);
952 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 else
954 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
956
Victor Stinnera849a4b2011-10-03 12:12:11 +0200957 if (ascii->wstr == data)
958 printf("shared ");
959 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200960
Victor Stinnera3b334d2011-10-03 13:53:37 +0200961 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 printf(" (%zu), ", compact->wstr_length);
963 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
964 printf("shared ");
965 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200966 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200967 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200968}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969#endif
970
971PyObject *
972PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
973{
974 PyObject *obj;
975 PyCompactUnicodeObject *unicode;
976 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200977 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200978 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 Py_ssize_t char_size;
980 Py_ssize_t struct_size;
981
982 /* Optimization for empty strings */
983 if (size == 0 && unicode_empty != NULL) {
984 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200985 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 }
987
988#ifdef Py_DEBUG
989 ++unicode_new_new_calls;
990#endif
991
Victor Stinner9e9d6892011-10-04 01:02:02 +0200992 is_ascii = 0;
993 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 struct_size = sizeof(PyCompactUnicodeObject);
995 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200996 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 char_size = 1;
998 is_ascii = 1;
999 struct_size = sizeof(PyASCIIObject);
1000 }
1001 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001002 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 char_size = 1;
1004 }
1005 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 2;
1008 if (sizeof(wchar_t) == 2)
1009 is_sharing = 1;
1010 }
1011 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001012 if (maxchar > MAX_UNICODE) {
1013 PyErr_SetString(PyExc_SystemError,
1014 "invalid maximum character passed to PyUnicode_New");
1015 return NULL;
1016 }
Victor Stinner8f825062012-04-27 13:55:39 +02001017 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 char_size = 4;
1019 if (sizeof(wchar_t) == 4)
1020 is_sharing = 1;
1021 }
1022
1023 /* Ensure we won't overflow the size. */
1024 if (size < 0) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "Negative size passed to PyUnicode_New");
1027 return NULL;
1028 }
1029 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1030 return PyErr_NoMemory();
1031
1032 /* Duplicated allocation code from _PyObject_New() instead of a call to
1033 * PyObject_New() so we are able to allocate space for the object and
1034 * it's data buffer.
1035 */
1036 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1037 if (obj == NULL)
1038 return PyErr_NoMemory();
1039 obj = PyObject_INIT(obj, &PyUnicode_Type);
1040 if (obj == NULL)
1041 return NULL;
1042
1043 unicode = (PyCompactUnicodeObject *)obj;
1044 if (is_ascii)
1045 data = ((PyASCIIObject*)obj) + 1;
1046 else
1047 data = unicode + 1;
1048 _PyUnicode_LENGTH(unicode) = size;
1049 _PyUnicode_HASH(unicode) = -1;
1050 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 _PyUnicode_STATE(unicode).compact = 1;
1053 _PyUnicode_STATE(unicode).ready = 1;
1054 _PyUnicode_STATE(unicode).ascii = is_ascii;
1055 if (is_ascii) {
1056 ((char*)data)[size] = 0;
1057 _PyUnicode_WSTR(unicode) = NULL;
1058 }
Victor Stinner8f825062012-04-27 13:55:39 +02001059 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001064 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 else {
1067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((Py_UCS4*)data)[size] = 0;
1073 if (is_sharing) {
1074 _PyUnicode_WSTR_LENGTH(unicode) = size;
1075 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1076 }
1077 else {
1078 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1079 _PyUnicode_WSTR(unicode) = NULL;
1080 }
1081 }
Victor Stinner8f825062012-04-27 13:55:39 +02001082#ifdef Py_DEBUG
1083 /* Fill the data with invalid characters to detect bugs earlier.
1084 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1085 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1086 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1087 memset(data, 0xff, size * kind);
1088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4; it
   decodes surrogate pairs.  The other conversions are implemented as macros
   for efficiency.

   A high surrogate immediately followed by a low surrogate is joined into
   one code point; every other unit (lone surrogates included) is copied
   unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* The output must never run past the string's declared length. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst += 1;
            src += 2;
            continue;
        }

        *dst = *src;
        dst += 1;
        src += 1;
    }
    /* The output must end exactly at the declared length. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Victor Stinneree4544c2012-05-09 22:24:08 +02001153 assert(0 <= how_many);
1154 assert(0 <= from_start);
1155 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_Check(from));
1157 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(from));
1159 assert(PyUnicode_IS_READY(to));
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && (from_kind > to_kind
1174 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001175 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001176 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1177 Py_UCS4 ch;
1178 Py_ssize_t i;
1179 for (i=0; i < how_many; i++) {
1180 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1181 assert(ch <= to_maxchar);
1182 }
1183 }
1184#endif
1185 fast = (from_kind == to_kind);
1186 if (check_maxchar
1187 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1188 {
1189 /* deny latin1 => ascii */
1190 fast = 0;
1191 }
1192
1193 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001194 Py_MEMCPY((char*)to_data + to_kind * to_start,
1195 (char*)from_data + from_kind * from_start,
1196 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001198 else if (from_kind == PyUnicode_1BYTE_KIND
1199 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001200 {
1201 _PyUnicode_CONVERT_BYTES(
1202 Py_UCS1, Py_UCS2,
1203 PyUnicode_1BYTE_DATA(from) + from_start,
1204 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1205 PyUnicode_2BYTE_DATA(to) + to_start
1206 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001207 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001208 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 && to_kind == PyUnicode_4BYTE_KIND)
1210 {
1211 _PyUnicode_CONVERT_BYTES(
1212 Py_UCS1, Py_UCS4,
1213 PyUnicode_1BYTE_DATA(from) + from_start,
1214 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1215 PyUnicode_4BYTE_DATA(to) + to_start
1216 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001217 }
1218 else if (from_kind == PyUnicode_2BYTE_KIND
1219 && to_kind == PyUnicode_4BYTE_KIND)
1220 {
1221 _PyUnicode_CONVERT_BYTES(
1222 Py_UCS2, Py_UCS4,
1223 PyUnicode_2BYTE_DATA(from) + from_start,
1224 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1225 PyUnicode_4BYTE_DATA(to) + to_start
1226 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001227 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001228 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 /* check if max_char(from substring) <= max_char(to) */
1230 if (from_kind > to_kind
1231 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001232 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001233 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 /* slow path to check for character overflow */
1235 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001236 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 Py_ssize_t i;
1238
Victor Stinner56c161a2011-10-06 02:47:11 +02001239#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001240 for (i=0; i < how_many; i++) {
1241 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001242 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001243 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1244 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001245#else
1246 if (!check_maxchar) {
1247 for (i=0; i < how_many; i++) {
1248 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1249 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1250 }
1251 }
1252 else {
1253 for (i=0; i < how_many; i++) {
1254 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1255 if (ch > to_maxchar)
1256 return 1;
1257 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1258 }
1259 }
1260#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001261 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001262 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001263 assert(0 && "inconsistent state");
1264 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001265 }
1266 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return 0;
1268}
1269
/* Copy `how_many` characters from `from` into `to` by calling
   _copy_characters() with check_maxchar=0; its return value is
   discarded. */
static void
copy_characters(PyObject *to, Py_ssize_t to_start,
                PyObject *from, Py_ssize_t from_start,
                Py_ssize_t how_many)
{
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
}
1277
1278Py_ssize_t
1279PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1280 PyObject *from, Py_ssize_t from_start,
1281 Py_ssize_t how_many)
1282{
1283 int err;
1284
1285 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1286 PyErr_BadInternalCall();
1287 return -1;
1288 }
1289
Benjamin Petersonbac79492012-01-14 13:34:47 -05001290 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001292 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001293 return -1;
1294
1295 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1296 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1297 PyErr_Format(PyExc_SystemError,
1298 "Cannot write %zi characters at %zi "
1299 "in a string of %zi characters",
1300 how_many, to_start, PyUnicode_GET_LENGTH(to));
1301 return -1;
1302 }
1303
1304 if (how_many == 0)
1305 return 0;
1306
Victor Stinner488fa492011-12-12 00:01:39 +01001307 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001308 return -1;
1309
1310 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1311 if (err) {
1312 PyErr_Format(PyExc_SystemError,
1313 "Cannot copy %s characters "
1314 "into a string of %s characters",
1315 unicode_kind_name(from),
1316 unicode_kind_name(to));
1317 return -1;
1318 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001319 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320}
1321
Victor Stinner17222162011-09-28 22:15:37 +02001322/* Find the maximum code point and count the number of surrogate pairs so a
1323 correct string length can be computed before converting a string to UCS4.
1324 This function counts single surrogates as a character and not as a pair.
1325
1326 Return 0 on success, or -1 on error. */
1327static int
1328find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1329 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330{
1331 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001332 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333
Victor Stinnerc53be962011-10-02 21:33:54 +02001334 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 *num_surrogates = 0;
1336 *maxchar = 0;
1337
1338 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001340 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1341 && (iter+1) < end
1342 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001344 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 iter += 2;
1347 }
1348 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001350 {
1351 ch = *iter;
1352 iter++;
1353 }
1354 if (ch > *maxchar) {
1355 *maxchar = ch;
1356 if (*maxchar > MAX_UNICODE) {
1357 PyErr_Format(PyExc_ValueError,
1358 "character U+%x is not in range [U+0000; U+10ffff]",
1359 ch);
1360 return -1;
1361 }
1362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 }
1364 return 0;
1365}
1366
#ifdef Py_DEBUG
/* Debug-build counter: incremented on each call to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
#endif
1370
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001371int
1372_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373{
1374 wchar_t *end;
1375 Py_UCS4 maxchar = 0;
1376 Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378 Py_ssize_t length_wo_surrogates;
1379#endif
1380
Georg Brandl7597add2011-10-05 16:36:47 +02001381 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 strings were created using _PyObject_New() and where no canonical
1383 representation (the str field) has been set yet aka strings
1384 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001385 assert(_PyUnicode_CHECK(unicode));
1386 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001390 /* Actually, it should neither be interned nor be anything else: */
1391 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
1393#ifdef Py_DEBUG
1394 ++unicode_ready_calls;
1395#endif
1396
1397 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001398 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001399 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401
1402 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001403 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1404 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 PyErr_NoMemory();
1406 return -1;
1407 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001408 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 _PyUnicode_WSTR(unicode), end,
1410 PyUnicode_1BYTE_DATA(unicode));
1411 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1412 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1413 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1414 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001415 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001416 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 }
1419 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001420 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 }
1424 PyObject_FREE(_PyUnicode_WSTR(unicode));
1425 _PyUnicode_WSTR(unicode) = NULL;
1426 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1427 }
1428 /* In this case we might have to convert down from 4-byte native
1429 wchar_t to 2-byte unicode. */
1430 else if (maxchar < 65536) {
1431 assert(num_surrogates == 0 &&
1432 "FindMaxCharAndNumSurrogatePairs() messed up");
1433
Victor Stinner506f5922011-09-28 22:34:18 +02001434#if SIZEOF_WCHAR_T == 2
1435 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001436 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001437 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1438 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1439 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001442#else
1443 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001444 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001445 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001446 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001447 PyErr_NoMemory();
1448 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 }
Victor Stinner506f5922011-09-28 22:34:18 +02001450 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1451 _PyUnicode_WSTR(unicode), end,
1452 PyUnicode_2BYTE_DATA(unicode));
1453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458 PyObject_FREE(_PyUnicode_WSTR(unicode));
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1461#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1464 else {
1465#if SIZEOF_WCHAR_T == 2
1466 /* in case the native representation is 2-bytes, we need to allocate a
1467 new normalized 4-byte version. */
1468 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001469 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1470 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 PyErr_NoMemory();
1472 return -1;
1473 }
1474 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1475 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001476 _PyUnicode_UTF8(unicode) = NULL;
1477 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001478 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1479 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001480 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 PyObject_FREE(_PyUnicode_WSTR(unicode));
1482 _PyUnicode_WSTR(unicode) = NULL;
1483 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1484#else
1485 assert(num_surrogates == 0);
1486
Victor Stinnerc3c74152011-10-02 20:39:55 +02001487 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001489 _PyUnicode_UTF8(unicode) = NULL;
1490 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1492#endif
1493 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1494 }
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001496 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 return 0;
1498}
1499
Alexander Belopolsky40018472011-02-26 01:02:56 +00001500static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001501unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502{
Walter Dörwald16807132007-05-25 13:52:07 +00001503 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 case SSTATE_NOT_INTERNED:
1505 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001506
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 case SSTATE_INTERNED_MORTAL:
1508 /* revive dead object temporarily for DelItem */
1509 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001510 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001511 Py_FatalError(
1512 "deletion of interned string failed");
1513 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001514
Benjamin Peterson29060642009-01-31 22:14:21 +00001515 case SSTATE_INTERNED_IMMORTAL:
1516 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 default:
1519 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001520 }
1521
Victor Stinner03490912011-10-03 23:45:12 +02001522 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001524 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001525 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001526 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1527 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001529 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530}
1531
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   (the empty string, or the cached one-character string stored in
   unicode_latin1[]), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    header = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not in the wchar_t-only state
       can be an entry of the Latin-1 cache. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256)
        return 0;
    return unicode_latin1[first] == unicode;
}
#endif
1548
Alexander Belopolsky40018472011-02-26 01:02:56 +00001549static int
Victor Stinner488fa492011-12-12 00:01:39 +01001550unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001551{
Victor Stinner488fa492011-12-12 00:01:39 +01001552 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 if (Py_REFCNT(unicode) != 1)
1554 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001555 if (_PyUnicode_HASH(unicode) != -1)
1556 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 if (PyUnicode_CHECK_INTERNED(unicode))
1558 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001559 if (!PyUnicode_CheckExact(unicode))
1560 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001561#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 /* singleton refcount is greater than 1 */
1563 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001564#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001565 return 1;
1566}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568static int
1569unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1570{
1571 PyObject *unicode;
1572 Py_ssize_t old_length;
1573
1574 assert(p_unicode != NULL);
1575 unicode = *p_unicode;
1576
1577 assert(unicode != NULL);
1578 assert(PyUnicode_Check(unicode));
1579 assert(0 <= length);
1580
Victor Stinner910337b2011-10-03 03:20:16 +02001581 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582 old_length = PyUnicode_WSTR_LENGTH(unicode);
1583 else
1584 old_length = PyUnicode_GET_LENGTH(unicode);
1585 if (old_length == length)
1586 return 0;
1587
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001588 if (length == 0) {
1589 Py_DECREF(*p_unicode);
1590 *p_unicode = unicode_empty;
1591 Py_INCREF(*p_unicode);
1592 return 0;
1593 }
1594
Victor Stinner488fa492011-12-12 00:01:39 +01001595 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 PyObject *copy = resize_copy(unicode, length);
1597 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 Py_DECREF(*p_unicode);
1600 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001602 }
1603
Victor Stinnerfe226c02011-10-03 03:52:20 +02001604 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001605 PyObject *new_unicode = resize_compact(unicode, length);
1606 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001608 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001609 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001610 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001611 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001612}
1613
Alexander Belopolsky40018472011-02-26 01:02:56 +00001614int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001616{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 PyObject *unicode;
1618 if (p_unicode == NULL) {
1619 PyErr_BadInternalCall();
1620 return -1;
1621 }
1622 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001623 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 {
1625 PyErr_BadInternalCall();
1626 return -1;
1627 }
1628 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001629}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001630
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001631static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001632unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1633 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001634{
1635 PyObject *result;
1636 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001637 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001638 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1639 return 0;
1640 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1641 maxchar);
1642 if (result == NULL)
1643 return -1;
Victor Stinner1b487b42012-05-03 12:29:04 +02001644 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001645 Py_DECREF(*p_unicode);
1646 *p_unicode = result;
1647 return 0;
1648}
1649
1650static int
1651unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1652 Py_UCS4 ch)
1653{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001654 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001655 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001656 return -1;
1657 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1658 PyUnicode_DATA(*p_unicode),
1659 (*pos)++, ch);
1660 return 0;
1661}
1662
Victor Stinnerc5166102012-02-22 13:55:02 +01001663/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1664 Return the length of the input string.
1665
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001666 WARNING: The function doesn't copy the terminating null character and
1667 doesn't check the maximum character (may write a latin1 character in an
1668 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001669static Py_ssize_t
1670unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1671{
1672 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1673 void *data = PyUnicode_DATA(unicode);
1674
1675 switch (kind) {
1676 case PyUnicode_1BYTE_KIND: {
1677 Py_ssize_t len = strlen(str);
1678 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001679 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001680 return len;
1681 }
1682 case PyUnicode_2BYTE_KIND: {
1683 Py_UCS2 *start = (Py_UCS2 *)data + index;
1684 Py_UCS2 *ucs2 = start;
1685 assert(index <= PyUnicode_GET_LENGTH(unicode));
1686
1687 for (; *str; ++ucs2, ++str)
1688 *ucs2 = (Py_UCS2)*str;
1689
1690 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1691 return ucs2 - start;
1692 }
1693 default: {
1694 Py_UCS4 *start = (Py_UCS4 *)data + index;
1695 Py_UCS4 *ucs4 = start;
1696 assert(kind == PyUnicode_4BYTE_KIND);
1697 assert(index <= PyUnicode_GET_LENGTH(unicode));
1698
1699 for (; *str; ++ucs4, ++str)
1700 *ucs4 = (Py_UCS4)*str;
1701
1702 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1703 return ucs4 - start;
1704 }
1705 }
1706}
1707
1708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709static PyObject*
1710get_latin1_char(unsigned char ch)
1711{
Victor Stinnera464fc12011-10-02 20:39:30 +02001712 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001714 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 if (!unicode)
1716 return NULL;
1717 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001718 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 unicode_latin1[ch] = unicode;
1720 }
1721 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001722 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723}
1724
Alexander Belopolsky40018472011-02-26 01:02:56 +00001725PyObject *
1726PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001728 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 Py_UCS4 maxchar = 0;
1730 Py_ssize_t num_surrogates;
1731
1732 if (u == NULL)
1733 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001735 /* If the Unicode data is known at construction time, we can apply
1736 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 /* Optimization for empty strings */
1739 if (size == 0 && unicode_empty != NULL) {
1740 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001741 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001742 }
Tim Petersced69f82003-09-16 20:30:58 +00001743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 /* Single character Unicode objects in the Latin-1 range are
1745 shared when using this constructor */
1746 if (size == 1 && *u < 256)
1747 return get_latin1_char((unsigned char)*u);
1748
1749 /* If not empty and not single character, copy the Unicode data
1750 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001751 if (find_maxchar_surrogates(u, u + size,
1752 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 return NULL;
1754
Victor Stinner8faf8212011-12-08 22:14:11 +01001755 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 if (!unicode)
1757 return NULL;
1758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 switch (PyUnicode_KIND(unicode)) {
1760 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001761 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1763 break;
1764 case PyUnicode_2BYTE_KIND:
1765#if Py_UNICODE_SIZE == 2
1766 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1767#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001768 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1770#endif
1771 break;
1772 case PyUnicode_4BYTE_KIND:
1773#if SIZEOF_WCHAR_T == 2
1774 /* This is the only case which has to process surrogates, thus
1775 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777#else
1778 assert(num_surrogates == 0);
1779 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1780#endif
1781 break;
1782 default:
1783 assert(0 && "Impossible state");
1784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001786 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787}
1788
Alexander Belopolsky40018472011-02-26 01:02:56 +00001789PyObject *
1790PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001791{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 if (size < 0) {
1793 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 return NULL;
1796 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001797 if (u != NULL)
1798 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1799 else
1800 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001801}
1802
Alexander Belopolsky40018472011-02-26 01:02:56 +00001803PyObject *
1804PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001805{
1806 size_t size = strlen(u);
1807 if (size > PY_SSIZE_T_MAX) {
1808 PyErr_SetString(PyExc_OverflowError, "input too long");
1809 return NULL;
1810 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001811 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001812}
1813
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001814PyObject *
1815_PyUnicode_FromId(_Py_Identifier *id)
1816{
1817 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001818 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1819 strlen(id->string),
1820 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001821 if (!id->object)
1822 return NULL;
1823 PyUnicode_InternInPlace(&id->object);
1824 assert(!id->next);
1825 id->next = static_strings;
1826 static_strings = id;
1827 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001828 return id->object;
1829}
1830
1831void
1832_PyUnicode_ClearStaticStrings()
1833{
1834 _Py_Identifier *i;
1835 for (i = static_strings; i; i = i->next) {
1836 Py_DECREF(i->object);
1837 i->object = NULL;
1838 i->next = NULL;
1839 }
1840}
1841
Benjamin Peterson0df54292012-03-26 14:50:32 -04001842/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Victor Stinnere57b1c02011-09-28 22:20:48 +02001844static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001845unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001846{
Victor Stinner785938e2011-12-11 20:09:03 +01001847 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001848 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001849#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001850 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001851#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001852 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001853 }
Victor Stinner785938e2011-12-11 20:09:03 +01001854 unicode = PyUnicode_New(size, 127);
1855 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001856 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001857 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1858 assert(_PyUnicode_CheckConsistency(unicode, 1));
1859 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001860}
1861
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001862static Py_UCS4
1863kind_maxchar_limit(unsigned int kind)
1864{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001865 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001866 case PyUnicode_1BYTE_KIND:
1867 return 0x80;
1868 case PyUnicode_2BYTE_KIND:
1869 return 0x100;
1870 case PyUnicode_4BYTE_KIND:
1871 return 0x10000;
1872 default:
1873 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001874 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001875 }
1876}
1877
Victor Stinnere6abb482012-05-02 01:15:40 +02001878Py_LOCAL_INLINE(Py_UCS4)
1879align_maxchar(Py_UCS4 maxchar)
1880{
1881 if (maxchar <= 127)
1882 return 127;
1883 else if (maxchar <= 255)
1884 return 255;
1885 else if (maxchar <= 65535)
1886 return 65535;
1887 else
1888 return MAX_UNICODE;
1889}
1890
Victor Stinner702c7342011-10-05 13:50:52 +02001891static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001892_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001895 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001896
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001897 if (size == 0) {
1898 Py_INCREF(unicode_empty);
1899 return unicode_empty;
1900 }
1901 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001902 if (size == 1)
1903 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001904
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001905 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001906 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 if (!res)
1908 return NULL;
1909 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001910 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001912}
1913
Victor Stinnere57b1c02011-09-28 22:20:48 +02001914static PyObject*
1915_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916{
1917 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001919
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920 if (size == 0) {
1921 Py_INCREF(unicode_empty);
1922 return unicode_empty;
1923 }
1924 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001925 if (size == 1) {
1926 Py_UCS4 ch = u[0];
1927 if (ch < 256)
1928 return get_latin1_char((unsigned char)ch);
1929
1930 res = PyUnicode_New(1, ch);
1931 if (res == NULL)
1932 return NULL;
1933 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1934 assert(_PyUnicode_CheckConsistency(res, 1));
1935 return res;
1936 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001938 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001939 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 if (!res)
1941 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001942 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 else {
1945 _PyUnicode_CONVERT_BYTES(
1946 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1947 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001948 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 return res;
1950}
1951
Victor Stinnere57b1c02011-09-28 22:20:48 +02001952static PyObject*
1953_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954{
1955 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001956 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001957
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001958 if (size == 0) {
1959 Py_INCREF(unicode_empty);
1960 return unicode_empty;
1961 }
1962 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001963 if (size == 1) {
1964 Py_UCS4 ch = u[0];
1965 if (ch < 256)
1966 return get_latin1_char((unsigned char)ch);
1967
1968 res = PyUnicode_New(1, ch);
1969 if (res == NULL)
1970 return NULL;
1971 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1972 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res;
1974 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001980 if (max_char < 256)
1981 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1982 PyUnicode_1BYTE_DATA(res));
1983 else if (max_char < 0x10000)
1984 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1985 PyUnicode_2BYTE_DATA(res));
1986 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001988 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 return res;
1990}
1991
1992PyObject*
1993PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1994{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001995 if (size < 0) {
1996 PyErr_SetString(PyExc_ValueError, "size must be positive");
1997 return NULL;
1998 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001999 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002001 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002003 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002005 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002006 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002007 PyErr_SetString(PyExc_SystemError, "invalid kind");
2008 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010}
2011
Victor Stinnerece58de2012-04-23 23:36:38 +02002012Py_UCS4
2013_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2014{
2015 enum PyUnicode_Kind kind;
2016 void *startptr, *endptr;
2017
2018 assert(PyUnicode_IS_READY(unicode));
2019 assert(0 <= start);
2020 assert(end <= PyUnicode_GET_LENGTH(unicode));
2021 assert(start <= end);
2022
2023 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2024 return PyUnicode_MAX_CHAR_VALUE(unicode);
2025
2026 if (start == end)
2027 return 127;
2028
Victor Stinner94d558b2012-04-27 22:26:58 +02002029 if (PyUnicode_IS_ASCII(unicode))
2030 return 127;
2031
Victor Stinnerece58de2012-04-23 23:36:38 +02002032 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002033 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002034 endptr = (char *)startptr + end * kind;
2035 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002036 switch(kind) {
2037 case PyUnicode_1BYTE_KIND:
2038 return ucs1lib_find_max_char(startptr, endptr);
2039 case PyUnicode_2BYTE_KIND:
2040 return ucs2lib_find_max_char(startptr, endptr);
2041 case PyUnicode_4BYTE_KIND:
2042 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002043 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002044 assert(0);
2045 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002046 }
2047}
2048
Victor Stinner25a4b292011-10-06 12:31:55 +02002049/* Ensure that a string uses the most efficient storage, if it is not the
2050 case: create a new string with of the right kind. Write NULL into *p_unicode
2051 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002052static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002053unicode_adjust_maxchar(PyObject **p_unicode)
2054{
2055 PyObject *unicode, *copy;
2056 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002057 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002058 unsigned int kind;
2059
2060 assert(p_unicode != NULL);
2061 unicode = *p_unicode;
2062 assert(PyUnicode_IS_READY(unicode));
2063 if (PyUnicode_IS_ASCII(unicode))
2064 return;
2065
2066 len = PyUnicode_GET_LENGTH(unicode);
2067 kind = PyUnicode_KIND(unicode);
2068 if (kind == PyUnicode_1BYTE_KIND) {
2069 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002070 max_char = ucs1lib_find_max_char(u, u + len);
2071 if (max_char >= 128)
2072 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002073 }
2074 else if (kind == PyUnicode_2BYTE_KIND) {
2075 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002076 max_char = ucs2lib_find_max_char(u, u + len);
2077 if (max_char >= 256)
2078 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002079 }
2080 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002081 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002083 max_char = ucs4lib_find_max_char(u, u + len);
2084 if (max_char >= 0x10000)
2085 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002087 copy = PyUnicode_New(len, max_char);
2088 copy_characters(copy, 0, unicode, 0, len);
2089 Py_DECREF(unicode);
2090 *p_unicode = copy;
2091}
2092
Victor Stinner034f6cf2011-09-30 02:26:44 +02002093PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002094_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002095{
Victor Stinner87af4f22011-11-21 23:03:47 +01002096 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002097 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002098
Victor Stinner034f6cf2011-09-30 02:26:44 +02002099 if (!PyUnicode_Check(unicode)) {
2100 PyErr_BadInternalCall();
2101 return NULL;
2102 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002103 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002104 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002105
Victor Stinner87af4f22011-11-21 23:03:47 +01002106 length = PyUnicode_GET_LENGTH(unicode);
2107 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 if (!copy)
2109 return NULL;
2110 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2111
Victor Stinner87af4f22011-11-21 23:03:47 +01002112 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2113 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002114 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002115 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002116}
2117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118
Victor Stinnerbc603d12011-10-02 01:00:40 +02002119/* Widen Unicode objects to larger buffers. Don't write terminating null
2120 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121
2122void*
2123_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2124{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002125 Py_ssize_t len;
2126 void *result;
2127 unsigned int skind;
2128
Benjamin Petersonbac79492012-01-14 13:34:47 -05002129 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002130 return NULL;
2131
2132 len = PyUnicode_GET_LENGTH(s);
2133 skind = PyUnicode_KIND(s);
2134 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002135 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 return NULL;
2137 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002138 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002139 case PyUnicode_2BYTE_KIND:
2140 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2141 if (!result)
2142 return PyErr_NoMemory();
2143 assert(skind == PyUnicode_1BYTE_KIND);
2144 _PyUnicode_CONVERT_BYTES(
2145 Py_UCS1, Py_UCS2,
2146 PyUnicode_1BYTE_DATA(s),
2147 PyUnicode_1BYTE_DATA(s) + len,
2148 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 case PyUnicode_4BYTE_KIND:
2151 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2152 if (!result)
2153 return PyErr_NoMemory();
2154 if (skind == PyUnicode_2BYTE_KIND) {
2155 _PyUnicode_CONVERT_BYTES(
2156 Py_UCS2, Py_UCS4,
2157 PyUnicode_2BYTE_DATA(s),
2158 PyUnicode_2BYTE_DATA(s) + len,
2159 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002161 else {
2162 assert(skind == PyUnicode_1BYTE_KIND);
2163 _PyUnicode_CONVERT_BYTES(
2164 Py_UCS1, Py_UCS4,
2165 PyUnicode_1BYTE_DATA(s),
2166 PyUnicode_1BYTE_DATA(s) + len,
2167 result);
2168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170 default:
2171 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 }
Victor Stinner01698042011-10-04 00:04:26 +02002173 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 return NULL;
2175}
2176
2177static Py_UCS4*
2178as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2179 int copy_null)
2180{
2181 int kind;
2182 void *data;
2183 Py_ssize_t len, targetlen;
2184 if (PyUnicode_READY(string) == -1)
2185 return NULL;
2186 kind = PyUnicode_KIND(string);
2187 data = PyUnicode_DATA(string);
2188 len = PyUnicode_GET_LENGTH(string);
2189 targetlen = len;
2190 if (copy_null)
2191 targetlen++;
2192 if (!target) {
2193 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2194 PyErr_NoMemory();
2195 return NULL;
2196 }
2197 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2198 if (!target) {
2199 PyErr_NoMemory();
2200 return NULL;
2201 }
2202 }
2203 else {
2204 if (targetsize < targetlen) {
2205 PyErr_Format(PyExc_SystemError,
2206 "string is longer than the buffer");
2207 if (copy_null && 0 < targetsize)
2208 target[0] = 0;
2209 return NULL;
2210 }
2211 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002212 if (kind == PyUnicode_1BYTE_KIND) {
2213 Py_UCS1 *start = (Py_UCS1 *) data;
2214 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002216 else if (kind == PyUnicode_2BYTE_KIND) {
2217 Py_UCS2 *start = (Py_UCS2 *) data;
2218 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2219 }
2220 else {
2221 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 if (copy_null)
2225 target[len] = 0;
2226 return target;
2227}
2228
2229Py_UCS4*
2230PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2231 int copy_null)
2232{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002233 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 PyErr_BadInternalCall();
2235 return NULL;
2236 }
2237 return as_ucs4(string, target, targetsize, copy_null);
2238}
2239
2240Py_UCS4*
2241PyUnicode_AsUCS4Copy(PyObject *string)
2242{
2243 return as_ucs4(string, NULL, 0, 1);
2244}
2245
2246#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002247
Alexander Belopolsky40018472011-02-26 01:02:56 +00002248PyObject *
2249PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002252 if (size == 0) {
2253 Py_INCREF(unicode_empty);
2254 return unicode_empty;
2255 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002256 PyErr_BadInternalCall();
2257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 }
2259
Martin v. Löwis790465f2008-04-05 20:41:37 +00002260 if (size == -1) {
2261 size = wcslen(w);
2262 }
2263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265}
2266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002268
Walter Dörwald346737f2007-05-31 10:44:43 +00002269static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002270makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2271 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002272{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 *fmt++ = '%';
2274 if (width) {
2275 if (zeropad)
2276 *fmt++ = '0';
2277 fmt += sprintf(fmt, "%d", width);
2278 }
2279 if (precision)
2280 fmt += sprintf(fmt, ".%d", precision);
2281 if (longflag)
2282 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002283 else if (longlongflag) {
2284 /* longlongflag should only ever be nonzero on machines with
2285 HAVE_LONG_LONG defined */
2286#ifdef HAVE_LONG_LONG
2287 char *f = PY_FORMAT_LONG_LONG;
2288 while (*f)
2289 *fmt++ = *f++;
2290#else
2291 /* we shouldn't ever get here */
2292 assert(0);
2293 *fmt++ = 'l';
2294#endif
2295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002296 else if (size_tflag) {
2297 char *f = PY_FORMAT_SIZE_T;
2298 while (*f)
2299 *fmt++ = *f++;
2300 }
2301 *fmt++ = c;
2302 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002303}
2304
/* Helper for PyUnicode_FromFormatV().

   'f' points to the '%' that starts a format specification.  Parse the
   optional width, ".precision" and length modifier ("l", "ll" or "z",
   recognized only when directly followed by 'd', 'u' or 'i') and store
   them through the non-NULL output pointers.  Return a pointer to the
   conversion character (or to the last parsed character for truncated or
   bogus formats, see the comments below). */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu (and the 'i' variants), then the
       size_t flag %zd, %zu, %zi. */
    if (f[0] == 'l') {
        const char next = f[1];
        if (next == 'd' || next == 'u' || next == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (next == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    else if (f[0] == 'z') {
        const char next = f[1];
        if (next == 'd' || next == 'u' || next == 'i') {
            size_tflag = 1;
            f += 1;
        }
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2369
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002370/* maximum number of characters required for output of %ld. 21 characters
2371 allows for 64-bit integers (in decimal) and an optional sign. */
2372#define MAX_LONG_CHARS 21
2373/* maximum number of characters required for output of %lld.
2374 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2375 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2376#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2377
Walter Dörwaldd2034312007-05-18 16:29:38 +00002378PyObject *
2379PyUnicode_FromFormatV(const char *format, va_list vargs)
2380{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002381 va_list count;
2382 Py_ssize_t callcount = 0;
2383 PyObject **callresults = NULL;
2384 PyObject **callresult = NULL;
2385 Py_ssize_t n = 0;
2386 int width = 0;
2387 int precision = 0;
2388 int zeropad;
2389 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002390 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002391 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002392 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2394 Py_UCS4 argmaxchar;
2395 Py_ssize_t numbersize = 0;
2396 char *numberresults = NULL;
2397 char *numberresult = NULL;
2398 Py_ssize_t i;
2399 int kind;
2400 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002401
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002402 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002403 /* step 1: count the number of %S/%R/%A/%s format specifications
2404 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2405 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002407 * also estimate a upper bound for all the number formats in the string,
2408 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 for (f = format; *f; f++) {
2411 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002412 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2414 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2415 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2416 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002419#ifdef HAVE_LONG_LONG
2420 if (longlongflag) {
2421 if (width < MAX_LONG_LONG_CHARS)
2422 width = MAX_LONG_LONG_CHARS;
2423 }
2424 else
2425#endif
2426 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2427 including sign. Decimal takes the most space. This
2428 isn't enough for octal. If a width is specified we
2429 need more (which we allocate later). */
2430 if (width < MAX_LONG_CHARS)
2431 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432
2433 /* account for the size + '\0' to separate numbers
2434 inside of the numberresults buffer */
2435 numbersize += (width + 1);
2436 }
2437 }
2438 else if ((unsigned char)*f > 127) {
2439 PyErr_Format(PyExc_ValueError,
2440 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2441 "string, got a non-ASCII byte: 0x%02x",
2442 (unsigned char)*f);
2443 return NULL;
2444 }
2445 }
2446 /* step 2: allocate memory for the results of
2447 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2448 if (callcount) {
2449 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2450 if (!callresults) {
2451 PyErr_NoMemory();
2452 return NULL;
2453 }
2454 callresult = callresults;
2455 }
2456 /* step 2.5: allocate memory for the results of formating numbers */
2457 if (numbersize) {
2458 numberresults = PyObject_Malloc(numbersize);
2459 if (!numberresults) {
2460 PyErr_NoMemory();
2461 goto fail;
2462 }
2463 numberresult = numberresults;
2464 }
2465
2466 /* step 3: format numbers and figure out how large a buffer we need */
2467 for (f = format; *f; f++) {
2468 if (*f == '%') {
2469 const char* p;
2470 int longflag;
2471 int longlongflag;
2472 int size_tflag;
2473 int numprinted;
2474
2475 p = f;
2476 zeropad = (f[1] == '0');
2477 f = parse_format_flags(f, &width, &precision,
2478 &longflag, &longlongflag, &size_tflag);
2479 switch (*f) {
2480 case 'c':
2481 {
2482 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002483 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 n++;
2485 break;
2486 }
2487 case '%':
2488 n++;
2489 break;
2490 case 'i':
2491 case 'd':
2492 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2493 width, precision, *f);
2494 if (longflag)
2495 numprinted = sprintf(numberresult, fmt,
2496 va_arg(count, long));
2497#ifdef HAVE_LONG_LONG
2498 else if (longlongflag)
2499 numprinted = sprintf(numberresult, fmt,
2500 va_arg(count, PY_LONG_LONG));
2501#endif
2502 else if (size_tflag)
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, Py_ssize_t));
2505 else
2506 numprinted = sprintf(numberresult, fmt,
2507 va_arg(count, int));
2508 n += numprinted;
2509 /* advance by +1 to skip over the '\0' */
2510 numberresult += (numprinted + 1);
2511 assert(*(numberresult - 1) == '\0');
2512 assert(*(numberresult - 2) != '\0');
2513 assert(numprinted >= 0);
2514 assert(numberresult <= numberresults + numbersize);
2515 break;
2516 case 'u':
2517 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2518 width, precision, 'u');
2519 if (longflag)
2520 numprinted = sprintf(numberresult, fmt,
2521 va_arg(count, unsigned long));
2522#ifdef HAVE_LONG_LONG
2523 else if (longlongflag)
2524 numprinted = sprintf(numberresult, fmt,
2525 va_arg(count, unsigned PY_LONG_LONG));
2526#endif
2527 else if (size_tflag)
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, size_t));
2530 else
2531 numprinted = sprintf(numberresult, fmt,
2532 va_arg(count, unsigned int));
2533 n += numprinted;
2534 numberresult += (numprinted + 1);
2535 assert(*(numberresult - 1) == '\0');
2536 assert(*(numberresult - 2) != '\0');
2537 assert(numprinted >= 0);
2538 assert(numberresult <= numberresults + numbersize);
2539 break;
2540 case 'x':
2541 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2542 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2543 n += numprinted;
2544 numberresult += (numprinted + 1);
2545 assert(*(numberresult - 1) == '\0');
2546 assert(*(numberresult - 2) != '\0');
2547 assert(numprinted >= 0);
2548 assert(numberresult <= numberresults + numbersize);
2549 break;
2550 case 'p':
2551 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2552 /* %p is ill-defined: ensure leading 0x. */
2553 if (numberresult[1] == 'X')
2554 numberresult[1] = 'x';
2555 else if (numberresult[1] != 'x') {
2556 memmove(numberresult + 2, numberresult,
2557 strlen(numberresult) + 1);
2558 numberresult[0] = '0';
2559 numberresult[1] = 'x';
2560 numprinted += 2;
2561 }
2562 n += numprinted;
2563 numberresult += (numprinted + 1);
2564 assert(*(numberresult - 1) == '\0');
2565 assert(*(numberresult - 2) != '\0');
2566 assert(numprinted >= 0);
2567 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 break;
2569 case 's':
2570 {
2571 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002572 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002573 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002574 if (!str)
2575 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 /* since PyUnicode_DecodeUTF8 returns already flexible
2577 unicode objects, there is no need to call ready on them */
2578 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002579 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002581 /* Remember the str and switch to the next slot */
2582 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 }
2585 case 'U':
2586 {
2587 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002588 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 if (PyUnicode_READY(obj) == -1)
2590 goto fail;
2591 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002592 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 break;
2595 }
2596 case 'V':
2597 {
2598 PyObject *obj = va_arg(count, PyObject *);
2599 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002602 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002603 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 if (PyUnicode_READY(obj) == -1)
2605 goto fail;
2606 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002607 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002609 *callresult++ = NULL;
2610 }
2611 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002612 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002613 if (!str_obj)
2614 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002615 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002616 Py_DECREF(str_obj);
2617 goto fail;
2618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002620 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002622 *callresult++ = str_obj;
2623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 break;
2625 }
2626 case 'S':
2627 {
2628 PyObject *obj = va_arg(count, PyObject *);
2629 PyObject *str;
2630 assert(obj);
2631 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002632 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002634 if (PyUnicode_READY(str) == -1) {
2635 Py_DECREF(str);
2636 goto fail;
2637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002639 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 /* Remember the str and switch to the next slot */
2642 *callresult++ = str;
2643 break;
2644 }
2645 case 'R':
2646 {
2647 PyObject *obj = va_arg(count, PyObject *);
2648 PyObject *repr;
2649 assert(obj);
2650 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002651 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002653 if (PyUnicode_READY(repr) == -1) {
2654 Py_DECREF(repr);
2655 goto fail;
2656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002658 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* Remember the repr and switch to the next slot */
2661 *callresult++ = repr;
2662 break;
2663 }
2664 case 'A':
2665 {
2666 PyObject *obj = va_arg(count, PyObject *);
2667 PyObject *ascii;
2668 assert(obj);
2669 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002670 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002672 if (PyUnicode_READY(ascii) == -1) {
2673 Py_DECREF(ascii);
2674 goto fail;
2675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002677 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 /* Remember the repr and switch to the next slot */
2680 *callresult++ = ascii;
2681 break;
2682 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 default:
2684 /* if we stumble upon an unknown
2685 formatting code, copy the rest of
2686 the format string to the output
2687 string. (we cannot just skip the
2688 code, since there's no way to know
2689 what's in the argument list) */
2690 n += strlen(p);
2691 goto expand;
2692 }
2693 } else
2694 n++;
2695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002696 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 we don't have to resize the string.
2700 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002701 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 if (!string)
2703 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 kind = PyUnicode_KIND(string);
2705 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002711 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002712
2713 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2715 /* checking for == because the last argument could be a empty
2716 string, which causes i to point to end, the assert at the end of
2717 the loop */
2718 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 switch (*f) {
2721 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002722 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 const int ordinal = va_arg(vargs, int);
2724 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002726 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002727 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002730 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002732 {
2733 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 /* unused, since we already have the result */
2735 if (*f == 'p')
2736 (void) va_arg(vargs, void *);
2737 else
2738 (void) va_arg(vargs, int);
2739 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002740 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002742 i += written;
2743 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 assert(*numberresult == '\0');
2745 numberresult++;
2746 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 case 's':
2750 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002751 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002753 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002754 size = PyUnicode_GET_LENGTH(*callresult);
2755 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002756 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002758 /* We're done with the unicode()/repr() => forget it */
2759 Py_DECREF(*callresult);
2760 /* switch to next unicode()/repr() result */
2761 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 break;
2763 }
2764 case 'U':
2765 {
2766 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 Py_ssize_t size;
2768 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2769 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002770 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002771 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 break;
2773 }
2774 case 'V':
2775 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002778 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002779 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 size = PyUnicode_GET_LENGTH(obj);
2781 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002782 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 size = PyUnicode_GET_LENGTH(*callresult);
2786 assert(PyUnicode_KIND(*callresult) <=
2787 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002788 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002792 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 break;
2794 }
2795 case 'S':
2796 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002797 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002799 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002800 /* unused, since we already have the result */
2801 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002803 copy_characters(string, i, *callresult, 0, size);
2804 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002805 /* We're done with the unicode()/repr() => forget it */
2806 Py_DECREF(*callresult);
2807 /* switch to next unicode()/repr() result */
2808 ++callresult;
2809 break;
2810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 break;
2814 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002815 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 goto end;
2818 }
Victor Stinner1205f272010-09-11 00:54:47 +00002819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 else {
2821 assert(i < PyUnicode_GET_LENGTH(string));
2822 PyUnicode_WRITE(kind, data, i++, *f);
2823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002826
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002828 if (callresults)
2829 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002830 if (numberresults)
2831 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002832 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002834 if (callresults) {
2835 PyObject **callresult2 = callresults;
2836 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002837 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002838 ++callresult2;
2839 }
2840 PyObject_Free(callresults);
2841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842 if (numberresults)
2843 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002845}
2846
Walter Dörwaldd2034312007-05-18 16:29:38 +00002847PyObject *
2848PyUnicode_FromFormat(const char *format, ...)
2849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002850 PyObject* ret;
2851 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002852
2853#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002855#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002856 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002857#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 ret = PyUnicode_FromFormatV(format, vargs);
2859 va_end(vargs);
2860 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861}
2862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002863#ifdef HAVE_WCHAR_H
2864
Victor Stinner5593d8a2010-10-02 11:11:27 +00002865/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2866 convert a Unicode object to a wide character string.
2867
Victor Stinnerd88d9832011-09-06 02:00:05 +02002868 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869 character) required to convert the unicode object. Ignore size argument.
2870
Victor Stinnerd88d9832011-09-06 02:00:05 +02002871 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002872 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002873 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002874static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002875unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002876 wchar_t *w,
2877 Py_ssize_t size)
2878{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002879 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 const wchar_t *wstr;
2881
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002882 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002883 if (wstr == NULL)
2884 return -1;
2885
Victor Stinner5593d8a2010-10-02 11:11:27 +00002886 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002887 if (size > res)
2888 size = res + 1;
2889 else
2890 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002891 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002892 return res;
2893 }
2894 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002895 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002896}
2897
2898Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002899PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002900 wchar_t *w,
2901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902{
2903 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 PyErr_BadInternalCall();
2905 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002907 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908}
2909
Victor Stinner137c34c2010-09-29 10:25:54 +00002910wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002911PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002912 Py_ssize_t *size)
2913{
2914 wchar_t* buffer;
2915 Py_ssize_t buflen;
2916
2917 if (unicode == NULL) {
2918 PyErr_BadInternalCall();
2919 return NULL;
2920 }
2921
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002922 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002923 if (buflen == -1)
2924 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002925 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002926 PyErr_NoMemory();
2927 return NULL;
2928 }
2929
Victor Stinner137c34c2010-09-29 10:25:54 +00002930 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2931 if (buffer == NULL) {
2932 PyErr_NoMemory();
2933 return NULL;
2934 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002935 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002936 if (buflen == -1)
2937 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002938 if (size != NULL)
2939 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002940 return buffer;
2941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944
Alexander Belopolsky40018472011-02-26 01:02:56 +00002945PyObject *
2946PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002949 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002950 PyErr_SetString(PyExc_ValueError,
2951 "chr() arg not in range(0x110000)");
2952 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002953 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 if (ordinal < 256)
2956 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 v = PyUnicode_New(1, ordinal);
2959 if (v == NULL)
2960 return NULL;
2961 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002962 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002964}
2965
Alexander Belopolsky40018472011-02-26 01:02:56 +00002966PyObject *
2967PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002969 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002971 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002972 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002973 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 Py_INCREF(obj);
2975 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002976 }
2977 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 /* For a Unicode subtype that's not a Unicode object,
2979 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002980 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002981 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002982 PyErr_Format(PyExc_TypeError,
2983 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002984 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002985 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002990 const char *encoding,
2991 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002992{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002993 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002994 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002995
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 PyErr_BadInternalCall();
2998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003000
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003001 /* Decoding bytes objects is the most common case and should be fast */
3002 if (PyBytes_Check(obj)) {
3003 if (PyBytes_GET_SIZE(obj) == 0) {
3004 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003005 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003006 }
3007 else {
3008 v = PyUnicode_Decode(
3009 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3010 encoding, errors);
3011 }
3012 return v;
3013 }
3014
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003015 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 PyErr_SetString(PyExc_TypeError,
3017 "decoding str is not supported");
3018 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003019 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003020
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003021 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3022 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3023 PyErr_Format(PyExc_TypeError,
3024 "coercing to str: need bytes, bytearray "
3025 "or buffer-like object, %.80s found",
3026 Py_TYPE(obj)->tp_name);
3027 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003028 }
Tim Petersced69f82003-09-16 20:30:58 +00003029
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003030 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003032 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 }
Tim Petersced69f82003-09-16 20:30:58 +00003034 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003035 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003036
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003037 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003038 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039}
3040
/* Normalize an encoding name for the shortcut comparisons: upper case
   letters (as classified by Py_ISUPPER) are converted with Py_TOLOWER and
   '_' is replaced with '-', in order to catch e.g. UTF_8.  A NULL encoding
   is normalized to "utf-8".

   lower receives the null-terminated result and must provide room for
   lower_len characters.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null character, does not fit into lower_len characters). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the terminator nothing can be stored; bail
       out before lower_len - 1 is computed below. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null character as well. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078PyObject *
3079PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003080 Py_ssize_t size,
3081 const char *encoding,
3082 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003083{
3084 PyObject *buffer = NULL, *unicode;
3085 Py_buffer info;
3086 char lower[11]; /* Enough for any encoding shortcut */
3087
Fred Drakee4315f52000-05-09 19:53:39 +00003088 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003089 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003090 if ((strcmp(lower, "utf-8") == 0) ||
3091 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003092 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003093 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003094 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003095 (strcmp(lower, "iso-8859-1") == 0))
3096 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003097#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003098 else if (strcmp(lower, "mbcs") == 0)
3099 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003100#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003101 else if (strcmp(lower, "ascii") == 0)
3102 return PyUnicode_DecodeASCII(s, size, errors);
3103 else if (strcmp(lower, "utf-16") == 0)
3104 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3105 else if (strcmp(lower, "utf-32") == 0)
3106 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108
3109 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003110 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003111 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003112 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003113 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 if (buffer == NULL)
3115 goto onError;
3116 unicode = PyCodec_Decode(buffer, encoding, errors);
3117 if (unicode == NULL)
3118 goto onError;
3119 if (!PyUnicode_Check(unicode)) {
3120 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003121 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003122 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 Py_DECREF(unicode);
3124 goto onError;
3125 }
3126 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003127 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003128
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 Py_XDECREF(buffer);
3131 return NULL;
3132}
3133
Alexander Belopolsky40018472011-02-26 01:02:56 +00003134PyObject *
3135PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003136 const char *encoding,
3137 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003138{
3139 PyObject *v;
3140
3141 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument();
3143 goto onError;
3144 }
3145
3146 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003148
3149 /* Decode via the codec registry */
3150 v = PyCodec_Decode(unicode, encoding, errors);
3151 if (v == NULL)
3152 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003153 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156 return NULL;
3157}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
3160PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 const char *encoding,
3162 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003163{
3164 PyObject *v;
3165
3166 if (!PyUnicode_Check(unicode)) {
3167 PyErr_BadArgument();
3168 goto onError;
3169 }
3170
3171 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003173
3174 /* Decode via the codec registry */
3175 v = PyCodec_Decode(unicode, encoding, errors);
3176 if (v == NULL)
3177 goto onError;
3178 if (!PyUnicode_Check(v)) {
3179 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003180 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181 Py_TYPE(v)->tp_name);
3182 Py_DECREF(v);
3183 goto onError;
3184 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003185 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003188 return NULL;
3189}
3190
Alexander Belopolsky40018472011-02-26 01:02:56 +00003191PyObject *
3192PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003193 Py_ssize_t size,
3194 const char *encoding,
3195 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196{
3197 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003198
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 unicode = PyUnicode_FromUnicode(s, size);
3200 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3203 Py_DECREF(unicode);
3204 return v;
3205}
3206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003209 const char *encoding,
3210 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003211{
3212 PyObject *v;
3213
3214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 goto onError;
3217 }
3218
3219 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003221
3222 /* Encode via the codec registry */
3223 v = PyCodec_Encode(unicode, encoding, errors);
3224 if (v == NULL)
3225 goto onError;
3226 return v;
3227
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003229 return NULL;
3230}
3231
/* Locate the first wide character of wstr that wcstombs() cannot convert
   in the current locale.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  The index of the first character
   whose conversion fails is returned.

   Returns 0 when every character converts, which cannot be told apart from
   a failure at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) followed by a terminator;
       slots that are never assigned keep their initial 0. */
    wchar_t unit[3] = { 0, 0, 0 };
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;

    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3278
Victor Stinner1b579672011-12-17 05:47:23 +01003279static int
3280locale_error_handler(const char *errors, int *surrogateescape)
3281{
3282 if (errors == NULL) {
3283 *surrogateescape = 0;
3284 return 0;
3285 }
3286
3287 if (strcmp(errors, "strict") == 0) {
3288 *surrogateescape = 0;
3289 return 0;
3290 }
3291 if (strcmp(errors, "surrogateescape") == 0) {
3292 *surrogateescape = 1;
3293 return 0;
3294 }
3295 PyErr_Format(PyExc_ValueError,
3296 "only 'strict' and 'surrogateescape' error handlers "
3297 "are supported, not '%s'",
3298 errors);
3299 return -1;
3300}
3301
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003302PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003303PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304{
3305 Py_ssize_t wlen, wlen2;
3306 wchar_t *wstr;
3307 PyObject *bytes = NULL;
3308 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003309 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003310 PyObject *exc;
3311 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003312 int surrogateescape;
3313
3314 if (locale_error_handler(errors, &surrogateescape) < 0)
3315 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003316
3317 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3318 if (wstr == NULL)
3319 return NULL;
3320
3321 wlen2 = wcslen(wstr);
3322 if (wlen2 != wlen) {
3323 PyMem_Free(wstr);
3324 PyErr_SetString(PyExc_TypeError, "embedded null character");
3325 return NULL;
3326 }
3327
3328 if (surrogateescape) {
3329 /* locale encoding with surrogateescape */
3330 char *str;
3331
3332 str = _Py_wchar2char(wstr, &error_pos);
3333 if (str == NULL) {
3334 if (error_pos == (size_t)-1) {
3335 PyErr_NoMemory();
3336 PyMem_Free(wstr);
3337 return NULL;
3338 }
3339 else {
3340 goto encode_error;
3341 }
3342 }
3343 PyMem_Free(wstr);
3344
3345 bytes = PyBytes_FromString(str);
3346 PyMem_Free(str);
3347 }
3348 else {
3349 size_t len, len2;
3350
3351 len = wcstombs(NULL, wstr, 0);
3352 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003353 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003354 goto encode_error;
3355 }
3356
3357 bytes = PyBytes_FromStringAndSize(NULL, len);
3358 if (bytes == NULL) {
3359 PyMem_Free(wstr);
3360 return NULL;
3361 }
3362
3363 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3364 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003365 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003366 goto encode_error;
3367 }
3368 PyMem_Free(wstr);
3369 }
3370 return bytes;
3371
3372encode_error:
3373 errmsg = strerror(errno);
3374 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003375
3376 if (error_pos == (size_t)-1)
3377 error_pos = wcstombs_errorpos(wstr);
3378
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003379 PyMem_Free(wstr);
3380 Py_XDECREF(bytes);
3381
Victor Stinner2f197072011-12-17 07:08:30 +01003382 if (errmsg != NULL) {
3383 size_t errlen;
3384 wstr = _Py_char2wchar(errmsg, &errlen);
3385 if (wstr != NULL) {
3386 reason = PyUnicode_FromWideChar(wstr, errlen);
3387 PyMem_Free(wstr);
3388 } else
3389 errmsg = NULL;
3390 }
3391 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003392 reason = PyUnicode_FromString(
3393 "wcstombs() encountered an unencodable "
3394 "wide character");
3395 if (reason == NULL)
3396 return NULL;
3397
3398 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3399 "locale", unicode,
3400 (Py_ssize_t)error_pos,
3401 (Py_ssize_t)(error_pos+1),
3402 reason);
3403 Py_DECREF(reason);
3404 if (exc != NULL) {
3405 PyCodec_StrictErrors(exc);
3406 Py_XDECREF(exc);
3407 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003408 return NULL;
3409}
3410
Victor Stinnerad158722010-10-27 00:25:46 +00003411PyObject *
3412PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003413{
Victor Stinner99b95382011-07-04 14:23:54 +02003414#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003415 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003416#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003418#else
Victor Stinner793b5312011-04-27 00:24:21 +02003419 PyInterpreterState *interp = PyThreadState_GET()->interp;
3420 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3421 cannot use it to encode and decode filenames before it is loaded. Load
3422 the Python codec requires to encode at least its own filename. Use the C
3423 version of the locale codec until the codec registry is initialized and
3424 the Python codec is loaded.
3425
3426 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3427 cannot only rely on it: check also interp->fscodec_initialized for
3428 subinterpreters. */
3429 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003430 return PyUnicode_AsEncodedString(unicode,
3431 Py_FileSystemDefaultEncoding,
3432 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003433 }
3434 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003435 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003436 }
Victor Stinnerad158722010-10-27 00:25:46 +00003437#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003438}
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440PyObject *
3441PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding,
3443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444{
3445 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003446 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 if (!PyUnicode_Check(unicode)) {
3449 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 }
Fred Drakee4315f52000-05-09 19:53:39 +00003452
Fred Drakee4315f52000-05-09 19:53:39 +00003453 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003454 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003455 if ((strcmp(lower, "utf-8") == 0) ||
3456 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003457 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003458 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003460 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003461 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003462 }
Victor Stinner37296e82010-06-10 13:36:23 +00003463 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003464 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003465 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003466 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003467#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003468 else if (strcmp(lower, "mbcs") == 0)
3469 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003470#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003471 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474
3475 /* Encode via the codec registry */
3476 v = PyCodec_Encode(unicode, encoding, errors);
3477 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003478 return NULL;
3479
3480 /* The normal path */
3481 if (PyBytes_Check(v))
3482 return v;
3483
3484 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003486 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003487 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003488
3489 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3490 "encoder %s returned bytearray instead of bytes",
3491 encoding);
3492 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003493 Py_DECREF(v);
3494 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003495 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003496
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003497 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3498 Py_DECREF(v);
3499 return b;
3500 }
3501
3502 PyErr_Format(PyExc_TypeError,
3503 "encoder did not return a bytes object (type=%.400s)",
3504 Py_TYPE(v)->tp_name);
3505 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506 return NULL;
3507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 const char *encoding,
3512 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513{
3514 PyObject *v;
3515
3516 if (!PyUnicode_Check(unicode)) {
3517 PyErr_BadArgument();
3518 goto onError;
3519 }
3520
3521 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003523
3524 /* Encode via the codec registry */
3525 v = PyCodec_Encode(unicode, encoding, errors);
3526 if (v == NULL)
3527 goto onError;
3528 if (!PyUnicode_Check(v)) {
3529 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003530 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003531 Py_TYPE(v)->tp_name);
3532 Py_DECREF(v);
3533 goto onError;
3534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003536
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 return NULL;
3539}
3540
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot decode in the current locale.

   Returns the byte offset at which mbrtowc() reports a conversion error or
   an incomplete character.  Returns 0 when no such position is found (which
   cannot be told apart from a failure at offset 0) and always when
   HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3571
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003573PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003574 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575{
3576 wchar_t smallbuf[256];
3577 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3578 wchar_t *wstr;
3579 size_t wlen, wlen2;
3580 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003581 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003582 size_t error_pos;
3583 char *errmsg;
3584 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003585
3586 if (locale_error_handler(errors, &surrogateescape) < 0)
3587 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003588
3589 if (str[len] != '\0' || len != strlen(str)) {
3590 PyErr_SetString(PyExc_TypeError, "embedded null character");
3591 return NULL;
3592 }
3593
3594 if (surrogateescape)
3595 {
3596 wstr = _Py_char2wchar(str, &wlen);
3597 if (wstr == NULL) {
3598 if (wlen == (size_t)-1)
3599 PyErr_NoMemory();
3600 else
3601 PyErr_SetFromErrno(PyExc_OSError);
3602 return NULL;
3603 }
3604
3605 unicode = PyUnicode_FromWideChar(wstr, wlen);
3606 PyMem_Free(wstr);
3607 }
3608 else {
3609#ifndef HAVE_BROKEN_MBSTOWCS
3610 wlen = mbstowcs(NULL, str, 0);
3611#else
3612 wlen = len;
3613#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003614 if (wlen == (size_t)-1)
3615 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003616 if (wlen+1 <= smallbuf_len) {
3617 wstr = smallbuf;
3618 }
3619 else {
3620 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3621 return PyErr_NoMemory();
3622
3623 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3624 if (!wstr)
3625 return PyErr_NoMemory();
3626 }
3627
3628 /* This shouldn't fail now */
3629 wlen2 = mbstowcs(wstr, str, wlen+1);
3630 if (wlen2 == (size_t)-1) {
3631 if (wstr != smallbuf)
3632 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003633 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634 }
3635#ifdef HAVE_BROKEN_MBSTOWCS
3636 assert(wlen2 == wlen);
3637#endif
3638 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3639 if (wstr != smallbuf)
3640 PyMem_Free(wstr);
3641 }
3642 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003643
3644decode_error:
3645 errmsg = strerror(errno);
3646 assert(errmsg != NULL);
3647
3648 error_pos = mbstowcs_errorpos(str, len);
3649 if (errmsg != NULL) {
3650 size_t errlen;
3651 wstr = _Py_char2wchar(errmsg, &errlen);
3652 if (wstr != NULL) {
3653 reason = PyUnicode_FromWideChar(wstr, errlen);
3654 PyMem_Free(wstr);
3655 } else
3656 errmsg = NULL;
3657 }
3658 if (errmsg == NULL)
3659 reason = PyUnicode_FromString(
3660 "mbstowcs() encountered an invalid multibyte sequence");
3661 if (reason == NULL)
3662 return NULL;
3663
3664 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3665 "locale", str, len,
3666 (Py_ssize_t)error_pos,
3667 (Py_ssize_t)(error_pos+1),
3668 reason);
3669 Py_DECREF(reason);
3670 if (exc != NULL) {
3671 PyCodec_StrictErrors(exc);
3672 Py_XDECREF(exc);
3673 }
3674 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003675}
3676
3677PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003678PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679{
3680 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003681 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003682}
3683
3684
3685PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003686PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003688 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3689}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003690
Christian Heimes5894ba72007-11-04 11:43:14 +00003691PyObject*
3692PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3693{
Victor Stinner99b95382011-07-04 14:23:54 +02003694#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003695 return PyUnicode_DecodeMBCS(s, size, NULL);
3696#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003697 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003698#else
Victor Stinner793b5312011-04-27 00:24:21 +02003699 PyInterpreterState *interp = PyThreadState_GET()->interp;
3700 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3701 cannot use it to encode and decode filenames before it is loaded. Load
3702 the Python codec requires to encode at least its own filename. Use the C
3703 version of the locale codec until the codec registry is initialized and
3704 the Python codec is loaded.
3705
3706 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3707 cannot only rely on it: check also interp->fscodec_initialized for
3708 subinterpreters. */
3709 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003710 return PyUnicode_Decode(s, size,
3711 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003712 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003713 }
3714 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003715 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716 }
Victor Stinnerad158722010-10-27 00:25:46 +00003717#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718}
3719
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720
3721int
Antoine Pitrou13348842012-01-29 18:36:34 +01003722_PyUnicode_HasNULChars(PyObject* s)
3723{
3724 static PyObject *nul = NULL;
3725
3726 if (nul == NULL)
3727 nul = PyUnicode_FromStringAndSize("\0", 1);
3728 if (nul == NULL)
3729 return -1;
3730 return PyUnicode_Contains(s, nul);
3731}
3732
3733
3734int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003735PyUnicode_FSConverter(PyObject* arg, void* addr)
3736{
3737 PyObject *output = NULL;
3738 Py_ssize_t size;
3739 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003740 if (arg == NULL) {
3741 Py_DECREF(*(PyObject**)addr);
3742 return 1;
3743 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003744 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003745 output = arg;
3746 Py_INCREF(output);
3747 }
3748 else {
3749 arg = PyUnicode_FromObject(arg);
3750 if (!arg)
3751 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003752 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753 Py_DECREF(arg);
3754 if (!output)
3755 return 0;
3756 if (!PyBytes_Check(output)) {
3757 Py_DECREF(output);
3758 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3759 return 0;
3760 }
3761 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003762 size = PyBytes_GET_SIZE(output);
3763 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003764 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003765 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003766 Py_DECREF(output);
3767 return 0;
3768 }
3769 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003770 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003771}
3772
3773
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003774int
3775PyUnicode_FSDecoder(PyObject* arg, void* addr)
3776{
3777 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003778 if (arg == NULL) {
3779 Py_DECREF(*(PyObject**)addr);
3780 return 1;
3781 }
3782 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003783 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003785 output = arg;
3786 Py_INCREF(output);
3787 }
3788 else {
3789 arg = PyBytes_FromObject(arg);
3790 if (!arg)
3791 return 0;
3792 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3793 PyBytes_GET_SIZE(arg));
3794 Py_DECREF(arg);
3795 if (!output)
3796 return 0;
3797 if (!PyUnicode_Check(output)) {
3798 Py_DECREF(output);
3799 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3800 return 0;
3801 }
3802 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003803 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003804 Py_DECREF(output);
3805 return 0;
3806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003808 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003809 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3810 Py_DECREF(output);
3811 return 0;
3812 }
3813 *(PyObject**)addr = output;
3814 return Py_CLEANUP_SUPPORTED;
3815}
3816
3817
Martin v. Löwis5b222132007-06-10 09:51:05 +00003818char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003820{
Christian Heimesf3863112007-11-22 07:46:41 +00003821 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003823 if (!PyUnicode_Check(unicode)) {
3824 PyErr_BadArgument();
3825 return NULL;
3826 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003828 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003830 if (PyUnicode_UTF8(unicode) == NULL) {
3831 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3833 if (bytes == NULL)
3834 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3836 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 Py_DECREF(bytes);
3838 return NULL;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3841 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3842 PyBytes_AS_STRING(bytes),
3843 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 Py_DECREF(bytes);
3845 }
3846
3847 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003848 *psize = PyUnicode_UTF8_LENGTH(unicode);
3849 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003850}
3851
3852char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3856}
3857
#ifdef Py_DEBUG
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it has to build the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3861
3862
3863Py_UNICODE *
3864PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 const unsigned char *one_byte;
3867#if SIZEOF_WCHAR_T == 4
3868 const Py_UCS2 *two_bytes;
3869#else
3870 const Py_UCS4 *four_bytes;
3871 const Py_UCS4 *ucs4_end;
3872 Py_ssize_t num_surrogates;
3873#endif
3874 wchar_t *w;
3875 wchar_t *wchar_end;
3876
3877 if (!PyUnicode_Check(unicode)) {
3878 PyErr_BadArgument();
3879 return NULL;
3880 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003883 assert(_PyUnicode_KIND(unicode) != 0);
3884 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885
3886#ifdef Py_DEBUG
3887 ++unicode_as_unicode_calls;
3888#endif
3889
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003892 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3893 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 num_surrogates = 0;
3895
3896 for (; four_bytes < ucs4_end; ++four_bytes) {
3897 if (*four_bytes > 0xFFFF)
3898 ++num_surrogates;
3899 }
3900
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3902 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3903 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904 PyErr_NoMemory();
3905 return NULL;
3906 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003907 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003909 w = _PyUnicode_WSTR(unicode);
3910 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3911 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3913 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003914 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003916 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3917 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 }
3919 else
3920 *w = *four_bytes;
3921
3922 if (w > wchar_end) {
3923 assert(0 && "Miscalculated string end");
3924 }
3925 }
3926 *w = 0;
3927#else
3928 /* sizeof(wchar_t) == 4 */
3929 Py_FatalError("Impossible unicode object state, wstr and str "
3930 "should share memory already.");
3931 return NULL;
3932#endif
3933 }
3934 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003935 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3936 (_PyUnicode_LENGTH(unicode) + 1));
3937 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938 PyErr_NoMemory();
3939 return NULL;
3940 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3942 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3943 w = _PyUnicode_WSTR(unicode);
3944 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003946 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3947 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 for (; w < wchar_end; ++one_byte, ++w)
3949 *w = *one_byte;
3950 /* null-terminate the wstr */
3951 *w = 0;
3952 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 for (; w < wchar_end; ++two_bytes, ++w)
3957 *w = *two_bytes;
3958 /* null-terminate the wstr */
3959 *w = 0;
3960#else
3961 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003962 PyObject_FREE(_PyUnicode_WSTR(unicode));
3963 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 Py_FatalError("Impossible unicode object state, wstr "
3965 "and str should share memory already.");
3966 return NULL;
3967#endif
3968 }
3969 else {
3970 assert(0 && "This should never happen.");
3971 }
3972 }
3973 }
3974 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003975 *size = PyUnicode_WSTR_LENGTH(unicode);
3976 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003977}
3978
Alexander Belopolsky40018472011-02-26 01:02:56 +00003979Py_UNICODE *
3980PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983}
3984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985
Alexander Belopolsky40018472011-02-26 01:02:56 +00003986Py_ssize_t
3987PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
3989 if (!PyUnicode_Check(unicode)) {
3990 PyErr_BadArgument();
3991 goto onError;
3992 }
3993 return PyUnicode_GET_SIZE(unicode);
3994
Benjamin Peterson29060642009-01-31 22:14:21 +00003995 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 return -1;
3997}
3998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999Py_ssize_t
4000PyUnicode_GetLength(PyObject *unicode)
4001{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004002 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 PyErr_BadArgument();
4004 return -1;
4005 }
4006
4007 return PyUnicode_GET_LENGTH(unicode);
4008}
4009
4010Py_UCS4
4011PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4012{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004013 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4014 PyErr_BadArgument();
4015 return (Py_UCS4)-1;
4016 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004017 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004018 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 return (Py_UCS4)-1;
4020 }
4021 return PyUnicode_READ_CHAR(unicode, index);
4022}
4023
4024int
4025PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4026{
4027 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004028 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 return -1;
4030 }
Victor Stinner488fa492011-12-12 00:01:39 +01004031 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004032 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004033 PyErr_SetString(PyExc_IndexError, "string index out of range");
4034 return -1;
4035 }
Victor Stinner488fa492011-12-12 00:01:39 +01004036 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004037 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004038 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4039 PyErr_SetString(PyExc_ValueError, "character out of range");
4040 return -1;
4041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4043 index, ch);
4044 return 0;
4045}
4046
/* Return the name of the default encoding, always "utf-8".  The returned
   string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4052
Victor Stinner554f3f02010-06-16 23:33:54 +00004053/* create or adjust a UnicodeDecodeError */
4054static void
4055make_decode_exception(PyObject **exceptionObject,
4056 const char *encoding,
4057 const char *input, Py_ssize_t length,
4058 Py_ssize_t startpos, Py_ssize_t endpos,
4059 const char *reason)
4060{
4061 if (*exceptionObject == NULL) {
4062 *exceptionObject = PyUnicodeDecodeError_Create(
4063 encoding, input, length, startpos, endpos, reason);
4064 }
4065 else {
4066 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4067 goto onError;
4068 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4069 goto onError;
4070 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4071 goto onError;
4072 }
4073 return;
4074
4075onError:
4076 Py_DECREF(*exceptionObject);
4077 *exceptionObject = NULL;
4078}
4079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080/* error handling callback helper:
4081 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004082 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 and adjust various state variables.
4084 return 0 on success, -1 on error
4085*/
4086
Alexander Belopolsky40018472011-02-26 01:02:56 +00004087static int
4088unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004089 const char *encoding, const char *reason,
4090 const char **input, const char **inend, Py_ssize_t *startinpos,
4091 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004092 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004094 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095
4096 PyObject *restuple = NULL;
4097 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004098 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004100 Py_ssize_t requiredsize;
4101 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004102 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 int res = -1;
4104
Victor Stinner596a6c42011-11-09 00:02:18 +01004105 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4106 outsize = PyUnicode_GET_LENGTH(*output);
4107 else
4108 outsize = _PyUnicode_WSTR_LENGTH(*output);
4109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *errorHandler = PyCodec_LookupError(errors);
4112 if (*errorHandler == NULL)
4113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 }
4115
Victor Stinner554f3f02010-06-16 23:33:54 +00004116 make_decode_exception(exceptionObject,
4117 encoding,
4118 *input, *inend - *input,
4119 *startinpos, *endinpos,
4120 reason);
4121 if (*exceptionObject == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
4124 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004128 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 }
4131 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004133 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004134 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004135
4136 /* Copy back the bytes variables, which might have been modified by the
4137 callback */
4138 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4139 if (!inputobj)
4140 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004141 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004144 *input = PyBytes_AS_STRING(inputobj);
4145 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004146 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004147 /* we can DECREF safely, as the exception has another reference,
4148 so the object won't go away. */
4149 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004153 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4155 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004156 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157
Victor Stinner596a6c42011-11-09 00:02:18 +01004158 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4159 /* need more space? (at least enough for what we
4160 have+the replacement+the rest of the string (starting
4161 at the new input position), so we won't have to check space
4162 when there are no errors in the rest of the string) */
4163 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4164 requiredsize = *outpos + replen + insize-newpos;
4165 if (requiredsize > outsize) {
4166 if (requiredsize<2*outsize)
4167 requiredsize = 2*outsize;
4168 if (unicode_resize(output, requiredsize) < 0)
4169 goto onError;
4170 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004171 if (unicode_widen(output, *outpos,
4172 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004174 copy_characters(*output, *outpos, repunicode, 0, replen);
4175 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004177 else {
4178 wchar_t *repwstr;
4179 Py_ssize_t repwlen;
4180 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4181 if (repwstr == NULL)
4182 goto onError;
4183 /* need more space? (at least enough for what we
4184 have+the replacement+the rest of the string (starting
4185 at the new input position), so we won't have to check space
4186 when there are no errors in the rest of the string) */
4187 requiredsize = *outpos + repwlen + insize-newpos;
4188 if (requiredsize > outsize) {
4189 if (requiredsize < 2*outsize)
4190 requiredsize = 2*outsize;
4191 if (unicode_resize(output, requiredsize) < 0)
4192 goto onError;
4193 }
4194 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4195 *outpos += repwlen;
4196 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004198 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 /* we made it! */
4201 res = 0;
4202
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_XDECREF(restuple);
4205 return res;
4206}
4207
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004208/* --- UTF-7 Codec -------------------------------------------------------- */
4209
Antoine Pitrou244651a2009-05-04 18:56:13 +00004210/* See RFC2152 for details. We encode conservatively and decode liberally. */
4211
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass expressions without side effects. */

/* Is c one of the 64 base-64 alphabet characters (A-Z, a-z, 0-9, +, /)? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* Base-64 value (0..63) of c, assuming c is a base-64 character.
   Any character outside A-Z, a-z, 0-9 and '+' yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4242
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) of the character.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated more than once; directO and directWS are parenthesized
 * so that compound expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288
Alexander Belopolsky40018472011-02-26 01:02:56 +00004289PyObject *
4290PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004291 Py_ssize_t size,
4292 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004294 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4295}
4296
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297/* The decoder. The only state we preserve is our read position,
4298 * i.e. how many characters we have consumed. So if we end in the
4299 * middle of a shift sequence we have to back off the read position
4300 * and the output to the beginning of the sequence, otherwise we lose
4301 * all the shift state (seen bits, number of bits seen, high
4302 * surrogate). */
4303
Alexander Belopolsky40018472011-02-26 01:02:56 +00004304PyObject *
4305PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004306 Py_ssize_t size,
4307 const char *errors,
4308 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004311 Py_ssize_t startinpos;
4312 Py_ssize_t endinpos;
4313 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004315 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316 const char *errmsg = "";
4317 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004318 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 unsigned int base64bits = 0;
4320 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004321 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 PyObject *errorHandler = NULL;
4323 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004325 /* Start off assuming it's all ASCII. Widen later as necessary. */
4326 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 if (!unicode)
4328 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004329 if (size == 0) {
4330 if (consumed)
4331 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004332 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004335 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 e = s + size;
4337
4338 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004339 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004341 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 if (inShift) { /* in a base-64 section */
4344 if (IS_BASE64(ch)) { /* consume a base-64 character */
4345 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4346 base64bits += 6;
4347 s++;
4348 if (base64bits >= 16) {
4349 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004350 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 base64bits -= 16;
4352 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4353 if (surrogate) {
4354 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004355 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4356 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4358 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004360 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 }
4362 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004363 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4364 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 }
4367 }
Victor Stinner551ac952011-11-29 22:58:13 +01004368 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 /* first surrogate */
4370 surrogate = outCh;
4371 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4374 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 }
4376 }
4377 }
4378 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 inShift = 0;
4380 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004382 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4383 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004384 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 if (base64bits > 0) { /* left-over bits */
4387 if (base64bits >= 6) {
4388 /* We've seen at least one base-64 character */
4389 errmsg = "partial character in shift sequence";
4390 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 else {
4393 /* Some bits remain; they should be zero */
4394 if (base64buffer != 0) {
4395 errmsg = "non-zero padding bits in shift sequence";
4396 goto utf7Error;
4397 }
4398 }
4399 }
4400 if (ch != '-') {
4401 /* '-' is absorbed; other terminating
4402 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4404 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 }
4407 }
4408 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 s++; /* consume '+' */
4411 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004413 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4414 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 }
4416 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004418 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
4421 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004423 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4424 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 s++;
4426 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 else {
4428 startinpos = s-starts;
4429 s++;
4430 errmsg = "unexpected special character";
4431 goto utf7Error;
4432 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 endinpos = s-starts;
4436 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 errors, &errorHandler,
4438 "utf7", errmsg,
4439 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004440 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 }
4443
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 /* end of string */
4445
4446 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4447 /* if we're in an inconsistent state, that's an error */
4448 if (surrogate ||
4449 (base64bits >= 6) ||
4450 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 endinpos = size;
4452 if (unicode_decode_call_errorhandler(
4453 errors, &errorHandler,
4454 "utf7", "unterminated shift sequence",
4455 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004456 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 goto onError;
4458 if (s < e)
4459 goto restart;
4460 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004461 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462
4463 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004464 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004467 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 }
4469 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004470 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004474 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475 goto onError;
4476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 Py_XDECREF(errorHandler);
4478 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004479 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 Py_XDECREF(errorHandler);
4483 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 Py_DECREF(unicode);
4485 return NULL;
4486}
4487
4488
Alexander Belopolsky40018472011-02-26 01:02:56 +00004489PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004490_PyUnicode_EncodeUTF7(PyObject *str,
4491 int base64SetO,
4492 int base64WhiteSpace,
4493 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004495 int kind;
4496 void *data;
4497 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004498 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004501 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 unsigned int base64bits = 0;
4503 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 char * out;
4505 char * start;
4506
Benjamin Petersonbac79492012-01-14 13:34:47 -05004507 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 return NULL;
4509 kind = PyUnicode_KIND(str);
4510 data = PyUnicode_DATA(str);
4511 len = PyUnicode_GET_LENGTH(str);
4512
4513 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 /* It might be possible to tighten this worst case */
4517 allocated = 8 * len;
4518 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004519 return PyErr_NoMemory();
4520
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522 if (v == NULL)
4523 return NULL;
4524
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004525 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004526 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004527 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (inShift) {
4530 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4531 /* shifting out */
4532 if (base64bits) { /* output remaining bits */
4533 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4534 base64buffer = 0;
4535 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 }
4537 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 /* Characters not in the BASE64 set implicitly unshift the sequence
4539 so no '-' is required, except if the character is itself a '-' */
4540 if (IS_BASE64(ch) || ch == '-') {
4541 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 *out++ = (char) ch;
4544 }
4545 else {
4546 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004547 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 else { /* not in a shift sequence */
4550 if (ch == '+') {
4551 *out++ = '+';
4552 *out++ = '-';
4553 }
4554 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4555 *out++ = (char) ch;
4556 }
4557 else {
4558 *out++ = '+';
4559 inShift = 1;
4560 goto encode_char;
4561 }
4562 }
4563 continue;
4564encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004566 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004567
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 /* code first surrogate */
4569 base64bits += 16;
4570 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4571 while (base64bits >= 6) {
4572 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4573 base64bits -= 6;
4574 }
4575 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004576 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 base64bits += 16;
4579 base64buffer = (base64buffer << 16) | ch;
4580 while (base64bits >= 6) {
4581 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4582 base64bits -= 6;
4583 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (base64bits)
4586 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4587 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004589 if (_PyBytes_Resize(&v, out - start) < 0)
4590 return NULL;
4591 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004593PyObject *
4594PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4595 Py_ssize_t size,
4596 int base64SetO,
4597 int base64WhiteSpace,
4598 const char *errors)
4599{
4600 PyObject *result;
4601 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4602 if (tmp == NULL)
4603 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004604 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004605 base64WhiteSpace, errors);
4606 Py_DECREF(tmp);
4607 return result;
4608}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610#undef IS_BASE64
4611#undef FROM_BASE64
4612#undef TO_BASE64
4613#undef DECODE_DIRECT
4614#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616/* --- UTF-8 Codec -------------------------------------------------------- */
4617
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An entry
   of zero marks a byte that cannot start a sequence.  See RFC 3629. */
static char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 70-7F */
    /* 80-BF: zero, not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    /* F0-FF */
};
4639
Alexander Belopolsky40018472011-02-26 01:02:56 +00004640PyObject *
4641PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004642 Py_ssize_t size,
4643 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004644{
Walter Dörwald69652032004-09-07 20:24:22 +00004645 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4646}
4647
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004648#include "stringlib/ucs1lib.h"
4649#include "stringlib/codecs.h"
4650#include "stringlib/undef.h"
4651
4652#include "stringlib/ucs2lib.h"
4653#include "stringlib/codecs.h"
4654#include "stringlib/undef.h"
4655
4656#include "stringlib/ucs4lib.h"
4657#include "stringlib/codecs.h"
4658#include "stringlib/undef.h"
4659
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   `addr & LONG_PTR_MASK` is zero for an aligned address, and
   `addr & ~LONG_PTR_MASK` rounds an address down to the previous boundary. */
#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: it has the top bit of every byte set, so
   `word & ASCII_CHAR_MASK` is non-zero iff some byte of `word` is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080L
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4672
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004673/* Scans a UTF-8 string and returns the maximum character to be expected
4674 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004675
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004676 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004678 */
4679static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004680utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004683 const unsigned char *end = p + string_size;
4684 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004686 assert(unicode_size != NULL);
4687
4688 /* By having a cascade of independent loops which fallback onto each
4689 other, we minimize the amount of work done in the average loop
4690 iteration, and we also maximize the CPU's ability to predict
4691 branches correctly (because a given condition will have always the
4692 same boolean outcome except perhaps in the last iteration of the
4693 corresponding loop).
4694 In the general case this brings us rather close to decoding
4695 performance pre-PEP 393, despite the two-pass decoding.
4696
4697 Note that the pure ASCII loop is not duplicated once a non-ASCII
4698 character has been encountered. It is actually a pessimization (by
4699 a significant factor) to use this loop on text with many non-ASCII
4700 characters, and it is important to avoid bad performance on valid
4701 utf-8 data (invalid utf-8 being a different can of worms).
4702 */
4703
4704 /* ASCII */
4705 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004706 /* Only check value if it's not a ASCII char... */
4707 if (*p < 0x80) {
4708 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4709 an explanation. */
4710 if (!((size_t) p & LONG_PTR_MASK)) {
4711 /* Help register allocation */
4712 register const unsigned char *_p = p;
4713 while (_p < aligned_end) {
4714 unsigned long value = *(unsigned long *) _p;
4715 if (value & ASCII_CHAR_MASK)
4716 break;
4717 _p += SIZEOF_LONG;
4718 char_count += SIZEOF_LONG;
4719 }
4720 p = _p;
4721 if (p == end)
4722 break;
4723 }
4724 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004725 if (*p < 0x80)
4726 ++char_count;
4727 else
4728 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004729 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004730 *unicode_size = char_count;
4731 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004733_ucs1loop:
4734 for (; p < end; ++p) {
4735 if (*p < 0xc4)
4736 char_count += ((*p & 0xc0) != 0x80);
4737 else
4738 goto _ucs2loop;
4739 }
4740 *unicode_size = char_count;
4741 return 255;
4742
4743_ucs2loop:
4744 for (; p < end; ++p) {
4745 if (*p < 0xf0)
4746 char_count += ((*p & 0xc0) != 0x80);
4747 else
4748 goto _ucs4loop;
4749 }
4750 *unicode_size = char_count;
4751 return 65535;
4752
4753_ucs4loop:
4754 for (; p < end; ++p) {
4755 char_count += ((*p & 0xc0) != 0x80);
4756 }
4757 *unicode_size = char_count;
4758 return 65537;
4759}
4760
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004761/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004762 in case of errors. Implicit parameters: unicode, kind, data, onError.
4763 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004764*/
Victor Stinner785938e2011-12-11 20:09:03 +01004765#define WRITE_MAYBE_FAIL(index, value) \
4766 do { \
4767 Py_ssize_t pos = index; \
4768 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4769 unicode_resize(&unicode, pos + pos/8) < 0) \
4770 goto onError; \
4771 if (unicode_putchar(&unicode, &pos, value) < 0) \
4772 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 } while (0)
4774
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004775static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004776decode_utf8_errors(const char *starts,
4777 Py_ssize_t size,
4778 const char *errors,
4779 Py_ssize_t *consumed,
4780 const char *s,
4781 PyObject *unicode,
4782 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004783{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004785 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004786 Py_ssize_t startinpos;
4787 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004788 const char *e = starts + size;
4789 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004790 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004791 PyObject *errorHandler = NULL;
4792 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004793
Antoine Pitrouab868312009-01-10 15:40:25 +00004794 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795
4796 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004797 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798
4799 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004800 /* Fast path for runs of ASCII characters. Given that common UTF-8
4801 input will consist of an overwhelming majority of ASCII
4802 characters, we try to optimize for this case by checking
4803 as many characters as a C 'long' can contain.
4804 First, check if we can do an aligned read, as most CPUs have
4805 a penalty for unaligned reads.
4806 */
4807 if (!((size_t) s & LONG_PTR_MASK)) {
4808 /* Help register allocation */
4809 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004810 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004811 while (_s < aligned_end) {
4812 /* Read a whole long at a time (either 4 or 8 bytes),
4813 and do a fast unrolled copy if it only contains ASCII
4814 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815 unsigned long value = *(unsigned long *) _s;
4816 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004817 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004818 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4819 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4820 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4821 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004822#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004823 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4824 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4825 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4826 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004827#endif
4828 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004829 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004830 }
4831 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004832 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004833 if (s == e)
4834 break;
4835 ch = (unsigned char)*s;
4836 }
4837 }
4838
4839 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004840 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 s++;
4842 continue;
4843 }
4844
4845 n = utf8_code_length[ch];
4846
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004847 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 if (consumed)
4849 break;
4850 else {
4851 errmsg = "unexpected end of data";
4852 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004853 endinpos = startinpos+1;
4854 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4855 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004856 goto utf8Error;
4857 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
4860 switch (n) {
4861
4862 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004863 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004864 startinpos = s-starts;
4865 endinpos = startinpos+1;
4866 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867
4868 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004869 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004870 startinpos = s-starts;
4871 endinpos = startinpos+1;
4872 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873
4874 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004875 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004876 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004877 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004878 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004879 goto utf8Error;
4880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004882 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004883 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 break;
4885
4886 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004887 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4888 will result in surrogates in range d800-dfff. Surrogates are
4889 not valid UTF-8 so they are rejected.
4890 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4891 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004892 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004893 (s[2] & 0xc0) != 0x80 ||
4894 ((unsigned char)s[0] == 0xE0 &&
4895 (unsigned char)s[1] < 0xA0) ||
4896 ((unsigned char)s[0] == 0xED &&
4897 (unsigned char)s[1] > 0x9F)) {
4898 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004900 endinpos = startinpos + 1;
4901
4902 /* if s[1] first two bits are 1 and 0, then the invalid
4903 continuation byte is s[2], so increment endinpos by 1,
4904 if not, s[1] is invalid and endinpos doesn't need to
4905 be incremented. */
4906 if ((s[1] & 0xC0) == 0x80)
4907 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 goto utf8Error;
4909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004911 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004912 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004913 break;
4914
4915 case 4:
4916 if ((s[1] & 0xc0) != 0x80 ||
4917 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004918 (s[3] & 0xc0) != 0x80 ||
4919 ((unsigned char)s[0] == 0xF0 &&
4920 (unsigned char)s[1] < 0x90) ||
4921 ((unsigned char)s[0] == 0xF4 &&
4922 (unsigned char)s[1] > 0x8F)) {
4923 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004924 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004925 endinpos = startinpos + 1;
4926 if ((s[1] & 0xC0) == 0x80) {
4927 endinpos++;
4928 if ((s[2] & 0xC0) == 0x80)
4929 endinpos++;
4930 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 goto utf8Error;
4932 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004933 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004934 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004935 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004936
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004937 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 }
4940 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004942
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 if (unicode_decode_call_errorhandler(
4945 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004946 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004947 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004948 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004950 /* Update data because unicode_decode_call_errorhandler might have
4951 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004952 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 }
Walter Dörwald69652032004-09-07 20:24:22 +00004954 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004957 /* Adjust length and ready string when it contained errors and
4958 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004959 if (unicode_resize(&unicode, i) < 0)
4960 goto onError;
4961 unicode_adjust_maxchar(&unicode);
4962 if (unicode == NULL)
4963 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004965 Py_XDECREF(errorHandler);
4966 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004967 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004968 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969
Benjamin Peterson29060642009-01-31 22:14:21 +00004970 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004971 Py_XDECREF(errorHandler);
4972 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004973 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974 return NULL;
4975}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004976#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004977
Victor Stinner785938e2011-12-11 20:09:03 +01004978PyObject *
4979PyUnicode_DecodeUTF8Stateful(const char *s,
4980 Py_ssize_t size,
4981 const char *errors,
4982 Py_ssize_t *consumed)
4983{
4984 Py_UCS4 maxchar = 0;
4985 Py_ssize_t unicode_size;
4986 int has_errors = 0;
4987 PyObject *unicode;
4988 int kind;
4989 void *data;
4990 const char *starts = s;
4991 const char *e;
4992 Py_ssize_t i;
4993
4994 if (size == 0) {
4995 if (consumed)
4996 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004997 Py_INCREF(unicode_empty);
4998 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004999 }
5000
Victor Stinnera1d12bb2011-12-11 21:53:09 +01005001 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01005002
5003 /* When the string is ASCII only, just use memcpy and return.
5004 unicode_size may be != size if there is an incomplete UTF-8
5005 sequence at the end of the ASCII block. */
5006 if (maxchar < 128 && size == unicode_size) {
5007 if (consumed)
5008 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01005009 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01005010 }
5011
5012 unicode = PyUnicode_New(unicode_size, maxchar);
5013 if (!unicode)
5014 return NULL;
5015 kind = PyUnicode_KIND(unicode);
5016 data = PyUnicode_DATA(unicode);
5017
5018 /* Unpack UTF-8 encoded data */
5019 i = 0;
5020 e = starts + size;
5021 switch (kind) {
5022 case PyUnicode_1BYTE_KIND:
5023 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
5024 break;
5025 case PyUnicode_2BYTE_KIND:
5026 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
5027 break;
5028 case PyUnicode_4BYTE_KIND:
5029 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
5030 break;
5031 }
5032 if (!has_errors) {
5033 /* Ensure the unicode size calculation was correct */
5034 assert(i == unicode_size);
5035 assert(s == e);
5036 if (consumed)
5037 *consumed = size;
5038 return unicode;
5039 }
5040
5041 /* In case of errors, maxchar and size computation might be incorrect;
5042 code below refits and resizes as necessary. */
5043 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5044}
5045
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005046#ifdef __APPLE__
5047
5048/* Simplified UTF-8 decoder using surrogateescape error handler,
5049 used to decode the command line arguments on Mac OS X. */
5050
5051wchar_t*
5052_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5053{
5054 int n;
5055 const char *e;
5056 wchar_t *unicode, *p;
5057
5058 /* Note: size will always be longer than the resulting Unicode
5059 character count */
5060 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
5061 PyErr_NoMemory();
5062 return NULL;
5063 }
5064 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
5065 if (!unicode)
5066 return NULL;
5067
5068 /* Unpack UTF-8 encoded data */
5069 p = unicode;
5070 e = s + size;
5071 while (s < e) {
5072 Py_UCS4 ch = (unsigned char)*s;
5073
5074 if (ch < 0x80) {
5075 *p++ = (wchar_t)ch;
5076 s++;
5077 continue;
5078 }
5079
5080 n = utf8_code_length[ch];
5081 if (s + n > e) {
5082 goto surrogateescape;
5083 }
5084
5085 switch (n) {
5086 case 0:
5087 case 1:
5088 goto surrogateescape;
5089
5090 case 2:
5091 if ((s[1] & 0xc0) != 0x80)
5092 goto surrogateescape;
5093 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
5094 assert ((ch > 0x007F) && (ch <= 0x07FF));
5095 *p++ = (wchar_t)ch;
5096 break;
5097
5098 case 3:
5099 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
5100 will result in surrogates in range d800-dfff. Surrogates are
5101 not valid UTF-8 so they are rejected.
5102 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
5103 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
5104 if ((s[1] & 0xc0) != 0x80 ||
5105 (s[2] & 0xc0) != 0x80 ||
5106 ((unsigned char)s[0] == 0xE0 &&
5107 (unsigned char)s[1] < 0xA0) ||
5108 ((unsigned char)s[0] == 0xED &&
5109 (unsigned char)s[1] > 0x9F)) {
5110
5111 goto surrogateescape;
5112 }
5113 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5114 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005115 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005116 break;
5117
5118 case 4:
5119 if ((s[1] & 0xc0) != 0x80 ||
5120 (s[2] & 0xc0) != 0x80 ||
5121 (s[3] & 0xc0) != 0x80 ||
5122 ((unsigned char)s[0] == 0xF0 &&
5123 (unsigned char)s[1] < 0x90) ||
5124 ((unsigned char)s[0] == 0xF4 &&
5125 (unsigned char)s[1] > 0x8F)) {
5126 goto surrogateescape;
5127 }
5128 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
5129 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01005130 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005131
5132#if SIZEOF_WCHAR_T == 4
5133 *p++ = (wchar_t)ch;
5134#else
5135 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01005136 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5137 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138#endif
5139 break;
5140 }
5141 s += n;
5142 continue;
5143
5144 surrogateescape:
5145 *p++ = 0xDC00 + ch;
5146 s++;
5147 }
5148 *p = L'\0';
5149 return unicode;
5150}
5151
5152#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005154/* Primary internal function which creates utf8 encoded bytes objects.
5155
5156 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005157 and allocate exactly as much space needed at the end. Else allocate the
5158 maximum possible needed (4 result bytes per Unicode character), and return
5159 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005160*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005161PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005162_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Victor Stinner6099a032011-12-18 14:22:26 +01005164 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005165 void *data;
5166 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005168 if (!PyUnicode_Check(unicode)) {
5169 PyErr_BadArgument();
5170 return NULL;
5171 }
5172
5173 if (PyUnicode_READY(unicode) == -1)
5174 return NULL;
5175
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005176 if (PyUnicode_UTF8(unicode))
5177 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5178 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005179
5180 kind = PyUnicode_KIND(unicode);
5181 data = PyUnicode_DATA(unicode);
5182 size = PyUnicode_GET_LENGTH(unicode);
5183
Benjamin Petersonead6b532011-12-20 17:23:42 -06005184 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005185 default:
5186 assert(0);
5187 case PyUnicode_1BYTE_KIND:
5188 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5189 assert(!PyUnicode_IS_ASCII(unicode));
5190 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5191 case PyUnicode_2BYTE_KIND:
5192 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5193 case PyUnicode_4BYTE_KIND:
5194 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005195 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196}
5197
Alexander Belopolsky40018472011-02-26 01:02:56 +00005198PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005199PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5200 Py_ssize_t size,
5201 const char *errors)
5202{
5203 PyObject *v, *unicode;
5204
5205 unicode = PyUnicode_FromUnicode(s, size);
5206 if (unicode == NULL)
5207 return NULL;
5208 v = _PyUnicode_AsUTF8String(unicode, errors);
5209 Py_DECREF(unicode);
5210 return v;
5211}
5212
5213PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005214PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005216 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217}
5218
Walter Dörwald41980ca2007-08-16 21:55:45 +00005219/* --- UTF-32 Codec ------------------------------------------------------- */
5220
5221PyObject *
5222PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005223 Py_ssize_t size,
5224 const char *errors,
5225 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005226{
5227 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5228}
5229
5230PyObject *
5231PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 Py_ssize_t size,
5233 const char *errors,
5234 int *byteorder,
5235 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005236{
5237 const char *starts = s;
5238 Py_ssize_t startinpos;
5239 Py_ssize_t endinpos;
5240 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005241 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005242 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005243 int bo = 0; /* assume native ordering by default */
5244 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005245 /* Offsets from q for retrieving bytes in the right order. */
5246#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5247 int iorder[] = {0, 1, 2, 3};
5248#else
5249 int iorder[] = {3, 2, 1, 0};
5250#endif
5251 PyObject *errorHandler = NULL;
5252 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005253
Walter Dörwald41980ca2007-08-16 21:55:45 +00005254 q = (unsigned char *)s;
5255 e = q + size;
5256
5257 if (byteorder)
5258 bo = *byteorder;
5259
5260 /* Check for BOM marks (U+FEFF) in the input and adjust current
5261 byte order setting accordingly. In native mode, the leading BOM
5262 mark is skipped, in all other modes, it is copied to the output
5263 stream as-is (giving a ZWNBSP character). */
5264 if (bo == 0) {
5265 if (size >= 4) {
5266 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005267 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005268#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 if (bom == 0x0000FEFF) {
5270 q += 4;
5271 bo = -1;
5272 }
5273 else if (bom == 0xFFFE0000) {
5274 q += 4;
5275 bo = 1;
5276 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005277#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 if (bom == 0x0000FEFF) {
5279 q += 4;
5280 bo = 1;
5281 }
5282 else if (bom == 0xFFFE0000) {
5283 q += 4;
5284 bo = -1;
5285 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005286#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005287 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005288 }
5289
5290 if (bo == -1) {
5291 /* force LE */
5292 iorder[0] = 0;
5293 iorder[1] = 1;
5294 iorder[2] = 2;
5295 iorder[3] = 3;
5296 }
5297 else if (bo == 1) {
5298 /* force BE */
5299 iorder[0] = 3;
5300 iorder[1] = 2;
5301 iorder[2] = 1;
5302 iorder[3] = 0;
5303 }
5304
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005305 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005306 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005307 if (!unicode)
5308 return NULL;
5309 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005310 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005311 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005312
Walter Dörwald41980ca2007-08-16 21:55:45 +00005313 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 Py_UCS4 ch;
5315 /* remaining bytes at the end? (size should be divisible by 4) */
5316 if (e-q<4) {
5317 if (consumed)
5318 break;
5319 errmsg = "truncated data";
5320 startinpos = ((const char *)q)-starts;
5321 endinpos = ((const char *)e)-starts;
5322 goto utf32Error;
5323 /* The remaining input chars are ignored if the callback
5324 chooses to skip the input */
5325 }
5326 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5327 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005328
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 if (ch >= 0x110000)
5330 {
5331 errmsg = "codepoint not in range(0x110000)";
5332 startinpos = ((const char *)q)-starts;
5333 endinpos = startinpos+4;
5334 goto utf32Error;
5335 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005336 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5337 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 q += 4;
5339 continue;
5340 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005341 if (unicode_decode_call_errorhandler(
5342 errors, &errorHandler,
5343 "utf32", errmsg,
5344 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005345 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005347 }
5348
5349 if (byteorder)
5350 *byteorder = bo;
5351
5352 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005354
5355 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005356 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005357 goto onError;
5358
5359 Py_XDECREF(errorHandler);
5360 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005361 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005362
Benjamin Peterson29060642009-01-31 22:14:21 +00005363 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005364 Py_DECREF(unicode);
5365 Py_XDECREF(errorHandler);
5366 Py_XDECREF(exc);
5367 return NULL;
5368}
5369
5370PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005371_PyUnicode_EncodeUTF32(PyObject *str,
5372 const char *errors,
5373 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375 int kind;
5376 void *data;
5377 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005378 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005379 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005380 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381 /* Offsets from p for storing byte pairs in the right order. */
5382#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5383 int iorder[] = {0, 1, 2, 3};
5384#else
5385 int iorder[] = {3, 2, 1, 0};
5386#endif
5387
Benjamin Peterson29060642009-01-31 22:14:21 +00005388#define STORECHAR(CH) \
5389 do { \
5390 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5391 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5392 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5393 p[iorder[0]] = (CH) & 0xff; \
5394 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395 } while(0)
5396
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005397 if (!PyUnicode_Check(str)) {
5398 PyErr_BadArgument();
5399 return NULL;
5400 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005401 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005402 return NULL;
5403 kind = PyUnicode_KIND(str);
5404 data = PyUnicode_DATA(str);
5405 len = PyUnicode_GET_LENGTH(str);
5406
5407 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005408 bytesize = nsize * 4;
5409 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005410 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005411 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005412 if (v == NULL)
5413 return NULL;
5414
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005415 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005417 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005418 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005419 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005420
5421 if (byteorder == -1) {
5422 /* force LE */
5423 iorder[0] = 0;
5424 iorder[1] = 1;
5425 iorder[2] = 2;
5426 iorder[3] = 3;
5427 }
5428 else if (byteorder == 1) {
5429 /* force BE */
5430 iorder[0] = 3;
5431 iorder[1] = 2;
5432 iorder[2] = 1;
5433 iorder[3] = 0;
5434 }
5435
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005436 for (i = 0; i < len; i++)
5437 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005438
5439 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005440 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005441#undef STORECHAR
5442}
5443
Alexander Belopolsky40018472011-02-26 01:02:56 +00005444PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5446 Py_ssize_t size,
5447 const char *errors,
5448 int byteorder)
5449{
5450 PyObject *result;
5451 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5452 if (tmp == NULL)
5453 return NULL;
5454 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5455 Py_DECREF(tmp);
5456 return result;
5457}
5458
5459PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005460PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005461{
Victor Stinnerb960b342011-11-20 19:12:52 +01005462 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005463}
5464
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465/* --- UTF-16 Codec ------------------------------------------------------- */
5466
Tim Peters772747b2001-08-09 22:21:55 +00005467PyObject *
5468PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 Py_ssize_t size,
5470 const char *errors,
5471 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Walter Dörwald69652032004-09-07 20:24:22 +00005473 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5474}
5475
/* Masks for processing UTF-16 input one C 'long' at a time.
   FAST_CHAR_MASK and SWAPPED_FAST_CHAR_MASK allow fast checking of whether
   a C 'long' may contain UTF16-encoded surrogate characters. This is an
   efficient heuristic, assuming that non-surrogate characters with a code
   point >= 0x8000 are rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
   STRIPPED_MASK selects the low byte of every 16-bit unit; it is used to
   swap the two bytes of each unit when the input is byteswapped.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005483#if (SIZEOF_LONG == 8)
5484# define FAST_CHAR_MASK 0x8000800080008000L
5485# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005486# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005487#elif (SIZEOF_LONG == 4)
5488# define FAST_CHAR_MASK 0x80008000L
5489# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005490# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005491#else
5492# error C 'long' size should be either 4 or 8!
5493#endif
5494
Walter Dörwald69652032004-09-07 20:24:22 +00005495PyObject *
5496PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 Py_ssize_t size,
5498 const char *errors,
5499 int *byteorder,
5500 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005501{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005502 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005503 Py_ssize_t startinpos;
5504 Py_ssize_t endinpos;
5505 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005506 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005507 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005508 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005509 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005510 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005511 /* Offsets from q for retrieving byte pairs in the right order. */
5512#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5513 int ihi = 1, ilo = 0;
5514#else
5515 int ihi = 0, ilo = 1;
5516#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005517 PyObject *errorHandler = NULL;
5518 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519
5520 /* Note: size will always be longer than the resulting Unicode
5521 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005522 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 if (!unicode)
5524 return NULL;
5525 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005526 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005527 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528
Tim Peters772747b2001-08-09 22:21:55 +00005529 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005530 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005531
5532 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005533 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005534
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005535 /* Check for BOM marks (U+FEFF) in the input and adjust current
5536 byte order setting accordingly. In native mode, the leading BOM
5537 mark is skipped, in all other modes, it is copied to the output
5538 stream as-is (giving a ZWNBSP character). */
5539 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005540 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005541 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005542#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005543 if (bom == 0xFEFF) {
5544 q += 2;
5545 bo = -1;
5546 }
5547 else if (bom == 0xFFFE) {
5548 q += 2;
5549 bo = 1;
5550 }
Tim Petersced69f82003-09-16 20:30:58 +00005551#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005552 if (bom == 0xFEFF) {
5553 q += 2;
5554 bo = 1;
5555 }
5556 else if (bom == 0xFFFE) {
5557 q += 2;
5558 bo = -1;
5559 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005560#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005561 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Tim Peters772747b2001-08-09 22:21:55 +00005564 if (bo == -1) {
5565 /* force LE */
5566 ihi = 1;
5567 ilo = 0;
5568 }
5569 else if (bo == 1) {
5570 /* force BE */
5571 ihi = 0;
5572 ilo = 1;
5573 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005574#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5575 native_ordering = ilo < ihi;
5576#else
5577 native_ordering = ilo > ihi;
5578#endif
Tim Peters772747b2001-08-09 22:21:55 +00005579
Antoine Pitrouab868312009-01-10 15:40:25 +00005580 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005581 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005582 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005583 /* First check for possible aligned read of a C 'long'. Unaligned
5584 reads are more expensive, better to defer to another iteration. */
5585 if (!((size_t) q & LONG_PTR_MASK)) {
5586 /* Fast path for runs of non-surrogate chars. */
5587 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005588 int kind = PyUnicode_KIND(unicode);
5589 void *data = PyUnicode_DATA(unicode);
5590 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005591 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005592 Py_UCS4 maxch;
5593 if (native_ordering) {
5594 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005595 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005596 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005597 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005598 else {
5599 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005600 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005601 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005602 block = ((block >> 8) & STRIPPED_MASK) |
5603 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005604 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005605 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005606#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005607 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005608 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005609 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005610 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005611 ch = (Py_UCS2)(block >> 48);
Victor Stinnere6abb482012-05-02 01:15:40 +02005612 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005613#else
5614 ch = (Py_UCS2)(block >> 16);
Victor Stinnere6abb482012-05-02 01:15:40 +02005615 maxch = MAX_MAXCHAR(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005616#endif
5617 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
Victor Stinner1b487b42012-05-03 12:29:04 +02005618 if (unicode_widen(&unicode, outpos, maxch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005619 goto onError;
5620 kind = PyUnicode_KIND(unicode);
5621 data = PyUnicode_DATA(unicode);
5622 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005623#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5624 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005625#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005626 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5627 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5628 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5629#else
5630 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5631#endif
5632#else
5633#if SIZEOF_LONG == 8
5634 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5635 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5636 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5637#else
5638 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5639#endif
5640 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005641#endif
5642 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005643 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005644 q = _q;
5645 if (q >= e)
5646 break;
5647 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005648 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649
Benjamin Peterson14339b62009-01-31 16:36:08 +00005650 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005651
Victor Stinner551ac952011-11-29 22:58:13 +01005652 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005653 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5654 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 continue;
5656 }
5657
5658 /* UTF-16 code pair: */
5659 if (q > e) {
5660 errmsg = "unexpected end of data";
5661 startinpos = (((const char *)q) - 2) - starts;
5662 endinpos = ((const char *)e) + 1 - starts;
5663 goto utf16Error;
5664 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005665 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5666 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005668 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005669 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005670 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005671 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 continue;
5673 }
5674 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005675 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005676 startinpos = (((const char *)q)-4)-starts;
5677 endinpos = startinpos+2;
5678 goto utf16Error;
5679 }
5680
Benjamin Peterson14339b62009-01-31 16:36:08 +00005681 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 errmsg = "illegal encoding";
5683 startinpos = (((const char *)q)-2)-starts;
5684 endinpos = startinpos+2;
5685 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005686
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005689 errors,
5690 &errorHandler,
5691 "utf16", errmsg,
5692 &starts,
5693 (const char **)&e,
5694 &startinpos,
5695 &endinpos,
5696 &exc,
5697 (const char **)&q,
5698 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005702 /* remaining byte at the end? (size should be even) */
5703 if (e == q) {
5704 if (!consumed) {
5705 errmsg = "truncated data";
5706 startinpos = ((const char *)q) - starts;
5707 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005708 if (unicode_decode_call_errorhandler(
5709 errors,
5710 &errorHandler,
5711 "utf16", errmsg,
5712 &starts,
5713 (const char **)&e,
5714 &startinpos,
5715 &endinpos,
5716 &exc,
5717 (const char **)&q,
5718 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005720 goto onError;
5721 /* The remaining input chars are ignored if the callback
5722 chooses to skip the input */
5723 }
5724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
5726 if (byteorder)
5727 *byteorder = bo;
5728
Walter Dörwald69652032004-09-07 20:24:22 +00005729 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005733 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 goto onError;
5735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005736 Py_XDECREF(errorHandler);
5737 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005738 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
Benjamin Peterson29060642009-01-31 22:14:21 +00005740 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 Py_XDECREF(errorHandler);
5743 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 return NULL;
5745}
5746
Antoine Pitrouab868312009-01-10 15:40:25 +00005747#undef FAST_CHAR_MASK
5748#undef SWAPPED_FAST_CHAR_MASK
5749
Tim Peters772747b2001-08-09 22:21:55 +00005750PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751_PyUnicode_EncodeUTF16(PyObject *str,
5752 const char *errors,
5753 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 int kind;
5756 void *data;
5757 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005758 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005759 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005760 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005761 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005762 /* Offsets from p for storing byte pairs in the right order. */
5763#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5764 int ihi = 1, ilo = 0;
5765#else
5766 int ihi = 0, ilo = 1;
5767#endif
5768
Benjamin Peterson29060642009-01-31 22:14:21 +00005769#define STORECHAR(CH) \
5770 do { \
5771 p[ihi] = ((CH) >> 8) & 0xff; \
5772 p[ilo] = (CH) & 0xff; \
5773 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005774 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005775
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005776 if (!PyUnicode_Check(str)) {
5777 PyErr_BadArgument();
5778 return NULL;
5779 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005780 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005781 return NULL;
5782 kind = PyUnicode_KIND(str);
5783 data = PyUnicode_DATA(str);
5784 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005785
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005786 pairs = 0;
5787 if (kind == PyUnicode_4BYTE_KIND)
5788 for (i = 0; i < len; i++)
5789 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5790 pairs++;
5791 /* 2 * (len + pairs + (byteorder == 0)) */
5792 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005794 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005795 bytesize = nsize * 2;
5796 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005798 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 if (v == NULL)
5800 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005802 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005804 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005805 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005806 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005807
5808 if (byteorder == -1) {
5809 /* force LE */
5810 ihi = 1;
5811 ilo = 0;
5812 }
5813 else if (byteorder == 1) {
5814 /* force BE */
5815 ihi = 0;
5816 ilo = 1;
5817 }
5818
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005819 for (i = 0; i < len; i++) {
5820 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5821 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005823 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5824 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 }
Tim Peters772747b2001-08-09 22:21:55 +00005826 STORECHAR(ch);
5827 if (ch2)
5828 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005829 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005830
5831 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005832 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005833#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834}
5835
Alexander Belopolsky40018472011-02-26 01:02:56 +00005836PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005837PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5838 Py_ssize_t size,
5839 const char *errors,
5840 int byteorder)
5841{
5842 PyObject *result;
5843 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5844 if (tmp == NULL)
5845 return NULL;
5846 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5847 Py_DECREF(tmp);
5848 return result;
5849}
5850
5851PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005852PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005854 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855}
5856
5857/* --- Unicode Escape Codec ----------------------------------------------- */
5858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005859/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5860 if all the escapes in the string make it still a valid ASCII string.
5861 Returns -1 if any escapes were found which cause the string to
5862 pop out of ASCII range. Otherwise returns the length of the
5863 required buffer to hold the string.
5864 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005865static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005866length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5867{
5868 const unsigned char *p = (const unsigned char *)s;
5869 const unsigned char *end = p + size;
5870 Py_ssize_t length = 0;
5871
5872 if (size < 0)
5873 return -1;
5874
5875 for (; p < end; ++p) {
5876 if (*p > 127) {
5877 /* Non-ASCII */
5878 return -1;
5879 }
5880 else if (*p != '\\') {
5881 /* Normal character */
5882 ++length;
5883 }
5884 else {
5885 /* Backslash-escape, check next char */
5886 ++p;
5887 /* Escape sequence reaches till end of string or
5888 non-ASCII follow-up. */
5889 if (p >= end || *p > 127)
5890 return -1;
5891 switch (*p) {
5892 case '\n':
5893 /* backslash + \n result in zero characters */
5894 break;
5895 case '\\': case '\'': case '\"':
5896 case 'b': case 'f': case 't':
5897 case 'n': case 'r': case 'v': case 'a':
5898 ++length;
5899 break;
5900 case '0': case '1': case '2': case '3':
5901 case '4': case '5': case '6': case '7':
5902 case 'x': case 'u': case 'U': case 'N':
5903 /* these do not guarantee ASCII characters */
5904 return -1;
5905 default:
5906 /* count the backslash + the other character */
5907 length += 2;
5908 }
5909 }
5910 }
5911 return length;
5912}
5913
Fredrik Lundh06d12682001-01-24 07:59:11 +00005914static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005915
Alexander Belopolsky40018472011-02-26 01:02:56 +00005916PyObject *
5917PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005918 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005919 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005921 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005922 Py_ssize_t startinpos;
5923 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005924 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005925 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005926 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005927 char* message;
5928 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 PyObject *errorHandler = NULL;
5930 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005931 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005932 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005933
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005934 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005935
5936 /* After length_of_escaped_ascii_string() there are two alternatives,
5937 either the string is pure ASCII with named escapes like \n, etc.
5938 and we determined it's exact size (common case)
5939 or it contains \x, \u, ... escape sequences. then we create a
5940 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005941 if (len >= 0) {
5942 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005943 if (!v)
5944 goto onError;
5945 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005946 }
5947 else {
5948 /* Escaped strings will always be longer than the resulting
5949 Unicode string, so we start with size here and then reduce the
5950 length after conversion to the true value.
5951 (but if the error callback returns a long replacement string
5952 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005953 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005954 if (!v)
5955 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005956 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005957 }
5958
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005960 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005961 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005962 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005963
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964 while (s < end) {
5965 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005966 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005967 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005969 /* The only case in which i == ascii_length is a backslash
5970 followed by a newline. */
5971 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005972
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 /* Non-escape characters are interpreted as Unicode ordinals */
5974 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005975 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5976 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 continue;
5978 }
5979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005980 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 /* \ - Escapes */
5982 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005983 c = *s++;
5984 if (s > end)
5985 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005987 /* The only case in which i == ascii_length is a backslash
5988 followed by a newline. */
5989 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005990
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005991 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992
Benjamin Peterson29060642009-01-31 22:14:21 +00005993 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005994#define WRITECHAR(ch) \
5995 do { \
5996 if (unicode_putchar(&v, &i, ch) < 0) \
5997 goto onError; \
5998 }while(0)
5999
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006001 case '\\': WRITECHAR('\\'); break;
6002 case '\'': WRITECHAR('\''); break;
6003 case '\"': WRITECHAR('\"'); break;
6004 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006005 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006006 case 'f': WRITECHAR('\014'); break;
6007 case 't': WRITECHAR('\t'); break;
6008 case 'n': WRITECHAR('\n'); break;
6009 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006010 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006011 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006012 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006013 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 case '0': case '1': case '2': case '3':
6017 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006018 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006019 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006020 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006021 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006022 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006024 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 break;
6026
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 /* hex escapes */
6028 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006030 digits = 2;
6031 message = "truncated \\xXX escape";
6032 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006036 digits = 4;
6037 message = "truncated \\uXXXX escape";
6038 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006041 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006042 digits = 8;
6043 message = "truncated \\UXXXXXXXX escape";
6044 hexescape:
6045 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006046 if (s+digits>end) {
6047 endinpos = size;
6048 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006049 errors, &errorHandler,
6050 "unicodeescape", "end of string in escape sequence",
6051 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006052 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006053 goto onError;
6054 goto nextByte;
6055 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006056 for (j = 0; j < digits; ++j) {
6057 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006058 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006059 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 errors, &errorHandler,
6062 "unicodeescape", message,
6063 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006064 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006065 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006066 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006068 }
6069 chr = (chr<<4) & ~0xF;
6070 if (c >= '0' && c <= '9')
6071 chr += c - '0';
6072 else if (c >= 'a' && c <= 'f')
6073 chr += 10 + c - 'a';
6074 else
6075 chr += 10 + c - 'A';
6076 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006077 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006078 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 /* _decoding_error will have already written into the
6080 target buffer. */
6081 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006082 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006083 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006084 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006085 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006086 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006088 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 errors, &errorHandler,
6090 "unicodeescape", "illegal Unicode character",
6091 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006092 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006093 goto onError;
6094 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006095 break;
6096
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006098 case 'N':
6099 message = "malformed \\N character escape";
6100 if (ucnhash_CAPI == NULL) {
6101 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006102 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6103 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006104 if (ucnhash_CAPI == NULL)
6105 goto ucnhashError;
6106 }
6107 if (*s == '{') {
6108 const char *start = s+1;
6109 /* look for the closing brace */
6110 while (*s != '}' && s < end)
6111 s++;
6112 if (s > start && s < end && *s == '}') {
6113 /* found a name. look it up in the unicode database */
6114 message = "unknown Unicode character name";
6115 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006116 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006117 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006118 goto store;
6119 }
6120 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 errors, &errorHandler,
6124 "unicodeescape", message,
6125 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006126 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006127 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006128 break;
6129
6130 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006131 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006132 message = "\\ at end of string";
6133 s--;
6134 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006135 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 errors, &errorHandler,
6137 "unicodeescape", message,
6138 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006139 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006140 goto onError;
6141 }
6142 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006143 WRITECHAR('\\');
6144 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006145 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006146 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006148 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006151#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006152
Victor Stinner16e6a802011-12-12 13:24:15 +01006153 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006154 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006155 Py_XDECREF(errorHandler);
6156 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006157 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006158
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006160 PyErr_SetString(
6161 PyExc_UnicodeError,
6162 "\\N escapes not supported (can't load unicodedata module)"
6163 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006164 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 Py_XDECREF(errorHandler);
6166 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006167 return NULL;
6168
Benjamin Peterson29060642009-01-31 22:14:21 +00006169 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 Py_XDECREF(errorHandler);
6172 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 return NULL;
6174}
6175
6176/* Return a Unicode-Escape string version of the Unicode object.
6177
6178 If quotes is true, the string is enclosed in u"" or u'' quotes as
6179 appropriate.
6180
6181*/
6182
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006184PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006187 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189 int kind;
6190 void *data;
6191 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
Thomas Wouters89f507f2006-12-13 04:49:30 +00006193 /* Initial allocation is based on the longest-possible unichr
6194 escape.
6195
6196 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6197 unichr, so in this case it's the longest unichr escape. In
6198 narrow (UTF-16) builds this is five chars per source unichr
6199 since there are two unichrs in the surrogate pair, so in narrow
6200 (UTF-16) builds it's not the longest unichr escape.
6201
6202 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6203 so in the narrow (UTF-16) build case it's the longest unichr
6204 escape.
6205 */
6206
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006207 if (!PyUnicode_Check(unicode)) {
6208 PyErr_BadArgument();
6209 return NULL;
6210 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006211 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006212 return NULL;
6213 len = PyUnicode_GET_LENGTH(unicode);
6214 kind = PyUnicode_KIND(unicode);
6215 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006216 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6218 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6219 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6220 }
6221
6222 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006223 return PyBytes_FromStringAndSize(NULL, 0);
6224
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006225 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006227
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006228 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006230 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 if (repr == NULL)
6233 return NULL;
6234
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006235 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006237 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006238 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006239
Walter Dörwald79e913e2007-05-12 11:08:06 +00006240 /* Escape backslashes */
6241 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242 *p++ = '\\';
6243 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006244 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006245 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006246
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006247 /* Map 21-bit characters to '\U00xxxxxx' */
6248 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006249 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006250 *p++ = '\\';
6251 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006252 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6258 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6259 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006261 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006262
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006264 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006265 *p++ = '\\';
6266 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006267 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6268 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6269 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6270 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006272
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006273 /* Map special whitespace to '\t', \n', '\r' */
6274 else if (ch == '\t') {
6275 *p++ = '\\';
6276 *p++ = 't';
6277 }
6278 else if (ch == '\n') {
6279 *p++ = '\\';
6280 *p++ = 'n';
6281 }
6282 else if (ch == '\r') {
6283 *p++ = '\\';
6284 *p++ = 'r';
6285 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006286
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006287 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006288 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006290 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006291 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6292 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006293 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006294
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295 /* Copy everything else as-is */
6296 else
6297 *p++ = (char) ch;
6298 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006299
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006300 assert(p - PyBytes_AS_STRING(repr) > 0);
6301 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6302 return NULL;
6303 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304}
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006307PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6308 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310 PyObject *result;
6311 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6312 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006314 result = PyUnicode_AsUnicodeEscapeString(tmp);
6315 Py_DECREF(tmp);
6316 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317}
6318
6319/* --- Raw Unicode Escape Codec ------------------------------------------- */
6320
Alexander Belopolsky40018472011-02-26 01:02:56 +00006321PyObject *
6322PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006323 Py_ssize_t size,
6324 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006325{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006327 Py_ssize_t startinpos;
6328 Py_ssize_t endinpos;
6329 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006330 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331 const char *end;
6332 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006333 PyObject *errorHandler = NULL;
6334 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006335
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 /* Escaped strings will always be longer than the resulting
6337 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338 length after conversion to the true value. (But decoding error
6339 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006340 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006341 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006344 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006345 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006346 end = s + size;
6347 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 unsigned char c;
6349 Py_UCS4 x;
6350 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006351 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 /* Non-escape characters are interpreted as Unicode ordinals */
6354 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006355 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6356 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006358 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 startinpos = s-starts;
6360
6361 /* \u-escapes are only interpreted iff the number of leading
6362 backslashes if odd */
6363 bs = s;
6364 for (;s < end;) {
6365 if (*s != '\\')
6366 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006367 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6368 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 }
6370 if (((s - bs) & 1) == 0 ||
6371 s >= end ||
6372 (*s != 'u' && *s != 'U')) {
6373 continue;
6374 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006375 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 count = *s=='u' ? 4 : 8;
6377 s++;
6378
6379 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 for (x = 0, i = 0; i < count; ++i, ++s) {
6381 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006382 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 endinpos = s-starts;
6384 if (unicode_decode_call_errorhandler(
6385 errors, &errorHandler,
6386 "rawunicodeescape", "truncated \\uXXXX",
6387 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006388 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 goto onError;
6390 goto nextByte;
6391 }
6392 x = (x<<4) & ~0xF;
6393 if (c >= '0' && c <= '9')
6394 x += c - '0';
6395 else if (c >= 'a' && c <= 'f')
6396 x += 10 + c - 'a';
6397 else
6398 x += 10 + c - 'A';
6399 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006400 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006401 if (unicode_putchar(&v, &outpos, x) < 0)
6402 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006403 } else {
6404 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006405 if (unicode_decode_call_errorhandler(
6406 errors, &errorHandler,
6407 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006409 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006411 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 nextByte:
6413 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006415 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006416 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 Py_XDECREF(errorHandler);
6418 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006419 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006420
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 Py_XDECREF(errorHandler);
6424 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 return NULL;
6426}
6427
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006432 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 char *p;
6434 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006435 Py_ssize_t expandsize, pos;
6436 int kind;
6437 void *data;
6438 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006440 if (!PyUnicode_Check(unicode)) {
6441 PyErr_BadArgument();
6442 return NULL;
6443 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006444 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006445 return NULL;
6446 kind = PyUnicode_KIND(unicode);
6447 data = PyUnicode_DATA(unicode);
6448 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006449 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6450 bytes, and 1 byte characters 4. */
6451 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006452
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006453 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006454 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006455
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006456 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457 if (repr == NULL)
6458 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006459 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006460 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006462 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006463 for (pos = 0; pos < len; pos++) {
6464 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 /* Map 32-bit characters to '\Uxxxxxxxx' */
6466 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006467 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006468 *p++ = '\\';
6469 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006470 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6476 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6477 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006478 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006480 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 *p++ = '\\';
6482 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006483 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6484 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6485 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6486 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 /* Copy everything else as-is */
6489 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 *p++ = (char) ch;
6491 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006493 assert(p > q);
6494 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006495 return NULL;
6496 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497}
6498
Alexander Belopolsky40018472011-02-26 01:02:56 +00006499PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006500PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6501 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006503 PyObject *result;
6504 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6505 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006506 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006507 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6508 Py_DECREF(tmp);
6509 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510}
6511
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006512/* --- Unicode Internal Codec ------------------------------------------- */
6513
Alexander Belopolsky40018472011-02-26 01:02:56 +00006514PyObject *
6515_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006516 Py_ssize_t size,
6517 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006518{
6519 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006520 Py_ssize_t startinpos;
6521 Py_ssize_t endinpos;
6522 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006523 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006524 const char *end;
6525 const char *reason;
6526 PyObject *errorHandler = NULL;
6527 PyObject *exc = NULL;
6528
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006529 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006530 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006531 1))
6532 return NULL;
6533
Thomas Wouters89f507f2006-12-13 04:49:30 +00006534 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006535 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006536 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006538 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006539 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006540 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006541 end = s + size;
6542
6543 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006544 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006545 Py_UCS4 ch;
6546 /* We copy the raw representation one byte at a time because the
6547 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006548 ((char *) &uch)[0] = s[0];
6549 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006550#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006551 ((char *) &uch)[2] = s[2];
6552 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006553#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006554 ch = uch;
6555
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006556 /* We have to sanity check the raw data, otherwise doom looms for
6557 some malformed UCS-4 data. */
6558 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006559#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006560 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006561#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006562 end-s < Py_UNICODE_SIZE
6563 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006565 startinpos = s - starts;
6566 if (end-s < Py_UNICODE_SIZE) {
6567 endinpos = end-starts;
6568 reason = "truncated input";
6569 }
6570 else {
6571 endinpos = s - starts + Py_UNICODE_SIZE;
6572 reason = "illegal code point (> 0x10FFFF)";
6573 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006574 if (unicode_decode_call_errorhandler(
6575 errors, &errorHandler,
6576 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006577 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006578 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006579 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006580 continue;
6581 }
6582
6583 s += Py_UNICODE_SIZE;
6584#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006585 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006586 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006587 Py_UNICODE uch2;
6588 ((char *) &uch2)[0] = s[0];
6589 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006590 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006591 {
Victor Stinner551ac952011-11-29 22:58:13 +01006592 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006593 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006594 }
6595 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006596#endif
6597
6598 if (unicode_putchar(&v, &outpos, ch) < 0)
6599 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006600 }
6601
Victor Stinner16e6a802011-12-12 13:24:15 +01006602 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006603 goto onError;
6604 Py_XDECREF(errorHandler);
6605 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006606 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006607
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006609 Py_XDECREF(v);
6610 Py_XDECREF(errorHandler);
6611 Py_XDECREF(exc);
6612 return NULL;
6613}
6614
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615/* --- Latin-1 Codec ------------------------------------------------------ */
6616
Alexander Belopolsky40018472011-02-26 01:02:56 +00006617PyObject *
6618PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006619 Py_ssize_t size,
6620 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006623 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624}
6625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006626/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627static void
6628make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006629 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006630 PyObject *unicode,
6631 Py_ssize_t startpos, Py_ssize_t endpos,
6632 const char *reason)
6633{
6634 if (*exceptionObject == NULL) {
6635 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006636 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006637 encoding, unicode, startpos, endpos, reason);
6638 }
6639 else {
6640 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6641 goto onError;
6642 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6643 goto onError;
6644 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6645 goto onError;
6646 return;
6647 onError:
6648 Py_DECREF(*exceptionObject);
6649 *exceptionObject = NULL;
6650 }
6651}
6652
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006654static void
6655raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006656 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006657 PyObject *unicode,
6658 Py_ssize_t startpos, Py_ssize_t endpos,
6659 const char *reason)
6660{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006661 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006662 encoding, unicode, startpos, endpos, reason);
6663 if (*exceptionObject != NULL)
6664 PyCodec_StrictErrors(*exceptionObject);
6665}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666
6667/* error handling callback helper:
6668 build arguments, call the callback and check the arguments,
6669 put the result into newpos and return the replacement string, which
6670 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006671static PyObject *
6672unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006673 PyObject **errorHandler,
6674 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006675 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006676 Py_ssize_t startpos, Py_ssize_t endpos,
6677 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006678{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006679 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006680 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006681 PyObject *restuple;
6682 PyObject *resunicode;
6683
6684 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006688 }
6689
Benjamin Petersonbac79492012-01-14 13:34:47 -05006690 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006691 return NULL;
6692 len = PyUnicode_GET_LENGTH(unicode);
6693
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006694 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006695 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006696 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006698
6699 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006701 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006702 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006704 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 Py_DECREF(restuple);
6706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006708 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 &resunicode, newpos)) {
6710 Py_DECREF(restuple);
6711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006712 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006713 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6714 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6715 Py_DECREF(restuple);
6716 return NULL;
6717 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 *newpos = len + *newpos;
6720 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006721 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6722 Py_DECREF(restuple);
6723 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006724 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 Py_INCREF(resunicode);
6726 Py_DECREF(restuple);
6727 return resunicode;
6728}
6729
Alexander Belopolsky40018472011-02-26 01:02:56 +00006730static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006732 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006733 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006735 /* input state */
6736 Py_ssize_t pos=0, size;
6737 int kind;
6738 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739 /* output object */
6740 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 /* pointer into the output */
6742 char *str;
6743 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006744 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006745 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6746 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 PyObject *errorHandler = NULL;
6748 PyObject *exc = NULL;
6749 /* the following variable is used for caching string comparisons
6750 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6751 int known_errorHandler = -1;
6752
Benjamin Petersonbac79492012-01-14 13:34:47 -05006753 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006754 return NULL;
6755 size = PyUnicode_GET_LENGTH(unicode);
6756 kind = PyUnicode_KIND(unicode);
6757 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 /* allocate enough for a simple encoding without
6759 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006760 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006761 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006762 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006764 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006765 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766 ressize = size;
6767
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 while (pos < size) {
6769 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770
Benjamin Peterson29060642009-01-31 22:14:21 +00006771 /* can we encode this? */
6772 if (c<limit) {
6773 /* no overflow check, because we know that the space is enough */
6774 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006775 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006776 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 Py_ssize_t requiredsize;
6779 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 Py_ssize_t collstart = pos;
6783 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006785 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006786 ++collend;
6787 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6788 if (known_errorHandler==-1) {
6789 if ((errors==NULL) || (!strcmp(errors, "strict")))
6790 known_errorHandler = 1;
6791 else if (!strcmp(errors, "replace"))
6792 known_errorHandler = 2;
6793 else if (!strcmp(errors, "ignore"))
6794 known_errorHandler = 3;
6795 else if (!strcmp(errors, "xmlcharrefreplace"))
6796 known_errorHandler = 4;
6797 else
6798 known_errorHandler = 0;
6799 }
6800 switch (known_errorHandler) {
6801 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006802 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 goto onError;
6804 case 2: /* replace */
6805 while (collstart++<collend)
6806 *str++ = '?'; /* fall through */
6807 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 break;
6810 case 4: /* xmlcharrefreplace */
6811 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006812 /* determine replacement size */
6813 for (i = collstart, repsize = 0; i < collend; ++i) {
6814 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6815 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006819 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006823 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006825 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006827 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006828 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006830 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006832 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 if (requiredsize > ressize) {
6834 if (requiredsize<2*ressize)
6835 requiredsize = 2*ressize;
6836 if (_PyBytes_Resize(&res, requiredsize))
6837 goto onError;
6838 str = PyBytes_AS_STRING(res) + respos;
6839 ressize = requiredsize;
6840 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 /* generate replacement */
6842 for (i = collstart; i < collend; ++i) {
6843 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006846 break;
6847 default:
6848 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006849 encoding, reason, unicode, &exc,
6850 collstart, collend, &newpos);
6851 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006852 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006853 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006854 if (PyBytes_Check(repunicode)) {
6855 /* Directly copy bytes result to output. */
6856 repsize = PyBytes_Size(repunicode);
6857 if (repsize > 1) {
6858 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006859 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006860 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6861 Py_DECREF(repunicode);
6862 goto onError;
6863 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006864 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006865 ressize += repsize-1;
6866 }
6867 memcpy(str, PyBytes_AsString(repunicode), repsize);
6868 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006870 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006871 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006872 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 /* need more space? (at least enough for what we
6874 have+the replacement+the rest of the string, so
6875 we won't have to check space for encodable characters) */
6876 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006877 repsize = PyUnicode_GET_LENGTH(repunicode);
6878 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 if (requiredsize > ressize) {
6880 if (requiredsize<2*ressize)
6881 requiredsize = 2*ressize;
6882 if (_PyBytes_Resize(&res, requiredsize)) {
6883 Py_DECREF(repunicode);
6884 goto onError;
6885 }
6886 str = PyBytes_AS_STRING(res) + respos;
6887 ressize = requiredsize;
6888 }
6889 /* check if there is anything unencodable in the replacement
6890 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 for (i = 0; repsize-->0; ++i, ++str) {
6892 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006894 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006895 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 Py_DECREF(repunicode);
6897 goto onError;
6898 }
6899 *str = (char)c;
6900 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006902 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006903 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006904 }
6905 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006906 /* Resize if we allocated to much */
6907 size = str - PyBytes_AS_STRING(res);
6908 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006909 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006910 if (_PyBytes_Resize(&res, size) < 0)
6911 goto onError;
6912 }
6913
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914 Py_XDECREF(errorHandler);
6915 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006916 return res;
6917
6918 onError:
6919 Py_XDECREF(res);
6920 Py_XDECREF(errorHandler);
6921 Py_XDECREF(exc);
6922 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923}
6924
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006925/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006926PyObject *
6927PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006928 Py_ssize_t size,
6929 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006931 PyObject *result;
6932 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6933 if (unicode == NULL)
6934 return NULL;
6935 result = unicode_encode_ucs1(unicode, errors, 256);
6936 Py_DECREF(unicode);
6937 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938}
6939
Alexander Belopolsky40018472011-02-26 01:02:56 +00006940PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006941_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942{
6943 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 PyErr_BadArgument();
6945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006947 if (PyUnicode_READY(unicode) == -1)
6948 return NULL;
6949 /* Fast path: if it is a one-byte string, construct
6950 bytes object directly. */
6951 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6952 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6953 PyUnicode_GET_LENGTH(unicode));
6954 /* Non-Latin-1 characters present. Defer to above function to
6955 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006956 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006957}
6958
6959PyObject*
6960PyUnicode_AsLatin1String(PyObject *unicode)
6961{
6962 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963}
6964
6965/* --- 7-bit ASCII Codec -------------------------------------------------- */
6966
Alexander Belopolsky40018472011-02-26 01:02:56 +00006967PyObject *
6968PyUnicode_DecodeASCII(const char *s,
6969 Py_ssize_t size,
6970 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006972 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006973 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006974 int kind;
6975 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006976 Py_ssize_t startinpos;
6977 Py_ssize_t endinpos;
6978 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006979 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006980 int has_error;
6981 const unsigned char *p = (const unsigned char *)s;
6982 const unsigned char *end = p + size;
6983 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006984 PyObject *errorHandler = NULL;
6985 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006986
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006987 if (size == 0) {
6988 Py_INCREF(unicode_empty);
6989 return unicode_empty;
6990 }
6991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006993 if (size == 1 && (unsigned char)s[0] < 128)
6994 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006995
Victor Stinner702c7342011-10-05 13:50:52 +02006996 has_error = 0;
6997 while (p < end && !has_error) {
6998 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6999 an explanation. */
7000 if (!((size_t) p & LONG_PTR_MASK)) {
7001 /* Help register allocation */
7002 register const unsigned char *_p = p;
7003 while (_p < aligned_end) {
7004 unsigned long value = *(unsigned long *) _p;
7005 if (value & ASCII_CHAR_MASK) {
7006 has_error = 1;
7007 break;
7008 }
7009 _p += SIZEOF_LONG;
7010 }
7011 if (_p == end)
7012 break;
7013 if (has_error)
7014 break;
7015 p = _p;
7016 }
7017 if (*p & 0x80) {
7018 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007019 break;
Victor Stinner702c7342011-10-05 13:50:52 +02007020 }
7021 else {
7022 ++p;
7023 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007024 }
Victor Stinner702c7342011-10-05 13:50:52 +02007025 if (!has_error)
7026 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00007027
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007028 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007029 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007031 kind = PyUnicode_KIND(v);
7032 data = PyUnicode_DATA(v);
7033 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 e = s + size;
7035 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007036 register unsigned char c = (unsigned char)*s;
7037 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007038 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00007039 ++s;
7040 }
7041 else {
7042 startinpos = s-starts;
7043 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 if (unicode_decode_call_errorhandler(
7045 errors, &errorHandler,
7046 "ascii", "ordinal not in range(128)",
7047 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007048 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007050 kind = PyUnicode_KIND(v);
7051 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007054 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007055 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056 Py_XDECREF(errorHandler);
7057 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007058 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007059 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007060
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007063 Py_XDECREF(errorHandler);
7064 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007065 return NULL;
7066}
7067
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007068/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007069PyObject *
7070PyUnicode_EncodeASCII(const Py_UNICODE *p,
7071 Py_ssize_t size,
7072 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007073{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007074 PyObject *result;
7075 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7076 if (unicode == NULL)
7077 return NULL;
7078 result = unicode_encode_ucs1(unicode, errors, 128);
7079 Py_DECREF(unicode);
7080 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081}
7082
Alexander Belopolsky40018472011-02-26 01:02:56 +00007083PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007084_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085{
7086 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007087 PyErr_BadArgument();
7088 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007090 if (PyUnicode_READY(unicode) == -1)
7091 return NULL;
7092 /* Fast path: if it is an ASCII-only string, construct bytes object
7093 directly. Else defer to above function to raise the exception. */
7094 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7095 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7096 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007097 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098}
7099
7100PyObject *
7101PyUnicode_AsASCIIString(PyObject *unicode)
7102{
7103 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104}
7105
Victor Stinner99b95382011-07-04 14:23:54 +02007106#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007107
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007108/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007109
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007110#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111#define NEED_RETRY
7112#endif
7113
Victor Stinner3a50e702011-10-18 21:21:00 +02007114#ifndef WC_ERR_INVALID_CHARS
7115# define WC_ERR_INVALID_CHARS 0x0080
7116#endif
7117
7118static char*
7119code_page_name(UINT code_page, PyObject **obj)
7120{
7121 *obj = NULL;
7122 if (code_page == CP_ACP)
7123 return "mbcs";
7124 if (code_page == CP_UTF7)
7125 return "CP_UTF7";
7126 if (code_page == CP_UTF8)
7127 return "CP_UTF8";
7128
7129 *obj = PyBytes_FromFormat("cp%u", code_page);
7130 if (*obj == NULL)
7131 return NULL;
7132 return PyBytes_AS_STRING(*obj);
7133}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134
Alexander Belopolsky40018472011-02-26 01:02:56 +00007135static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007136is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137{
7138 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007140
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 if (!IsDBCSLeadByteEx(code_page, *curr))
7142 return 0;
7143
7144 prev = CharPrevExA(code_page, s, curr, 0);
7145 if (prev == curr)
7146 return 1;
7147 /* FIXME: This code is limited to "true" double-byte encodings,
7148 as it assumes an incomplete character consists of a single
7149 byte. */
7150 if (curr - prev == 2)
7151 return 1;
7152 if (!IsDBCSLeadByteEx(code_page, *prev))
7153 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154 return 0;
7155}
7156
Victor Stinner3a50e702011-10-18 21:21:00 +02007157static DWORD
7158decode_code_page_flags(UINT code_page)
7159{
7160 if (code_page == CP_UTF7) {
7161 /* The CP_UTF7 decoder only supports flags=0 */
7162 return 0;
7163 }
7164 else
7165 return MB_ERR_INVALID_CHARS;
7166}
7167
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007168/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 * Decode a byte string from a Windows code page into unicode object in strict
7170 * mode.
7171 *
7172 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7173 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007174 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007175static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007176decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007177 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 const char *in,
7179 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007180{
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007182 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007184
7185 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 assert(insize > 0);
7187 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7188 if (outsize <= 0)
7189 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190
7191 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007193 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007194 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 if (*v == NULL)
7196 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007198 }
7199 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007202 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007203 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205 }
7206
7207 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7209 if (outsize <= 0)
7210 goto error;
7211 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007212
Victor Stinner3a50e702011-10-18 21:21:00 +02007213error:
7214 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7215 return -2;
7216 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007217 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007218}
7219
Victor Stinner3a50e702011-10-18 21:21:00 +02007220/*
7221 * Decode a byte string from a code page into unicode object with an error
7222 * handler.
7223 *
7224 * Returns consumed size if succeed, or raise a WindowsError or
7225 * UnicodeDecodeError exception and returns -1 on error.
7226 */
7227static int
7228decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007229 PyObject **v,
7230 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007231 const char *errors)
7232{
7233 const char *startin = in;
7234 const char *endin = in + size;
7235 const DWORD flags = decode_code_page_flags(code_page);
7236 /* Ideally, we should get reason from FormatMessage. This is the Windows
7237 2000 English version of the message. */
7238 const char *reason = "No mapping for the Unicode character exists "
7239 "in the target code page.";
7240 /* each step cannot decode more than 1 character, but a character can be
7241 represented as a surrogate pair */
7242 wchar_t buffer[2], *startout, *out;
7243 int insize, outsize;
7244 PyObject *errorHandler = NULL;
7245 PyObject *exc = NULL;
7246 PyObject *encoding_obj = NULL;
7247 char *encoding;
7248 DWORD err;
7249 int ret = -1;
7250
7251 assert(size > 0);
7252
7253 encoding = code_page_name(code_page, &encoding_obj);
7254 if (encoding == NULL)
7255 return -1;
7256
7257 if (errors == NULL || strcmp(errors, "strict") == 0) {
7258 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7259 UnicodeDecodeError. */
7260 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7261 if (exc != NULL) {
7262 PyCodec_StrictErrors(exc);
7263 Py_CLEAR(exc);
7264 }
7265 goto error;
7266 }
7267
7268 if (*v == NULL) {
7269 /* Create unicode object */
7270 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7271 PyErr_NoMemory();
7272 goto error;
7273 }
Victor Stinnerab595942011-12-17 04:59:06 +01007274 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007275 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 if (*v == NULL)
7277 goto error;
7278 startout = PyUnicode_AS_UNICODE(*v);
7279 }
7280 else {
7281 /* Extend unicode object */
7282 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7283 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7284 PyErr_NoMemory();
7285 goto error;
7286 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007287 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 goto error;
7289 startout = PyUnicode_AS_UNICODE(*v) + n;
7290 }
7291
7292 /* Decode the byte string character per character */
7293 out = startout;
7294 while (in < endin)
7295 {
7296 /* Decode a character */
7297 insize = 1;
7298 do
7299 {
7300 outsize = MultiByteToWideChar(code_page, flags,
7301 in, insize,
7302 buffer, Py_ARRAY_LENGTH(buffer));
7303 if (outsize > 0)
7304 break;
7305 err = GetLastError();
7306 if (err != ERROR_NO_UNICODE_TRANSLATION
7307 && err != ERROR_INSUFFICIENT_BUFFER)
7308 {
7309 PyErr_SetFromWindowsErr(0);
7310 goto error;
7311 }
7312 insize++;
7313 }
7314 /* 4=maximum length of a UTF-8 sequence */
7315 while (insize <= 4 && (in + insize) <= endin);
7316
7317 if (outsize <= 0) {
7318 Py_ssize_t startinpos, endinpos, outpos;
7319
7320 startinpos = in - startin;
7321 endinpos = startinpos + 1;
7322 outpos = out - PyUnicode_AS_UNICODE(*v);
7323 if (unicode_decode_call_errorhandler(
7324 errors, &errorHandler,
7325 encoding, reason,
7326 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007327 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 {
7329 goto error;
7330 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007331 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 }
7333 else {
7334 in += insize;
7335 memcpy(out, buffer, outsize * sizeof(wchar_t));
7336 out += outsize;
7337 }
7338 }
7339
7340 /* write a NUL character at the end */
7341 *out = 0;
7342
7343 /* Extend unicode object */
7344 outsize = out - startout;
7345 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007346 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007348 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007349
7350error:
7351 Py_XDECREF(encoding_obj);
7352 Py_XDECREF(errorHandler);
7353 Py_XDECREF(exc);
7354 return ret;
7355}
7356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357static PyObject *
7358decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007359 const char *s, Py_ssize_t size,
7360 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007361{
Victor Stinner76a31a62011-11-04 00:05:13 +01007362 PyObject *v = NULL;
7363 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007364
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 if (code_page < 0) {
7366 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7367 return NULL;
7368 }
7369
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007370 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007372
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 do
7374 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 if (size > INT_MAX) {
7377 chunk_size = INT_MAX;
7378 final = 0;
7379 done = 0;
7380 }
7381 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007383 {
7384 chunk_size = (int)size;
7385 final = (consumed == NULL);
7386 done = 1;
7387 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 /* Skip trailing lead-byte unless 'final' is set */
7390 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7391 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392
Victor Stinner76a31a62011-11-04 00:05:13 +01007393 if (chunk_size == 0 && done) {
7394 if (v != NULL)
7395 break;
7396 Py_INCREF(unicode_empty);
7397 return unicode_empty;
7398 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007399
Victor Stinner76a31a62011-11-04 00:05:13 +01007400
7401 converted = decode_code_page_strict(code_page, &v,
7402 s, chunk_size);
7403 if (converted == -2)
7404 converted = decode_code_page_errors(code_page, &v,
7405 s, chunk_size,
7406 errors);
7407 assert(converted != 0);
7408
7409 if (converted < 0) {
7410 Py_XDECREF(v);
7411 return NULL;
7412 }
7413
7414 if (consumed)
7415 *consumed += converted;
7416
7417 s += converted;
7418 size -= converted;
7419 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007420
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007421 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422}
7423
Alexander Belopolsky40018472011-02-26 01:02:56 +00007424PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007425PyUnicode_DecodeCodePageStateful(int code_page,
7426 const char *s,
7427 Py_ssize_t size,
7428 const char *errors,
7429 Py_ssize_t *consumed)
7430{
7431 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7432}
7433
7434PyObject *
7435PyUnicode_DecodeMBCSStateful(const char *s,
7436 Py_ssize_t size,
7437 const char *errors,
7438 Py_ssize_t *consumed)
7439{
7440 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7441}
7442
7443PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007444PyUnicode_DecodeMBCS(const char *s,
7445 Py_ssize_t size,
7446 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007447{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7449}
7450
Victor Stinner3a50e702011-10-18 21:21:00 +02007451static DWORD
7452encode_code_page_flags(UINT code_page, const char *errors)
7453{
7454 if (code_page == CP_UTF8) {
7455 if (winver.dwMajorVersion >= 6)
7456 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7457 and later */
7458 return WC_ERR_INVALID_CHARS;
7459 else
7460 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7461 return 0;
7462 }
7463 else if (code_page == CP_UTF7) {
7464 /* CP_UTF7 only supports flags=0 */
7465 return 0;
7466 }
7467 else {
7468 if (errors != NULL && strcmp(errors, "replace") == 0)
7469 return 0;
7470 else
7471 return WC_NO_BEST_FIT_CHARS;
7472 }
7473}
7474
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007476 * Encode a Unicode string to a Windows code page into a byte string in strict
7477 * mode.
7478 *
7479 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7480 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007481 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007482static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007483encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007484 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486{
Victor Stinner554f3f02010-06-16 23:33:54 +00007487 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007488 BOOL *pusedDefaultChar = &usedDefaultChar;
7489 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007490 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007491 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007492 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 const DWORD flags = encode_code_page_flags(code_page, NULL);
7494 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007495 /* Create a substring so that we can get the UTF-16 representation
7496 of just the slice under consideration. */
7497 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007498
Martin v. Löwis3d325192011-11-04 18:23:06 +01007499 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007500
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007502 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007504 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007505
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 substring = PyUnicode_Substring(unicode, offset, offset+len);
7507 if (substring == NULL)
7508 return -1;
7509 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7510 if (p == NULL) {
7511 Py_DECREF(substring);
7512 return -1;
7513 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007514
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007515 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 outsize = WideCharToMultiByte(code_page, flags,
7517 p, size,
7518 NULL, 0,
7519 NULL, pusedDefaultChar);
7520 if (outsize <= 0)
7521 goto error;
7522 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007523 if (pusedDefaultChar && *pusedDefaultChar) {
7524 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007526 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007527
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007531 if (*outbytes == NULL) {
7532 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007534 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007536 }
7537 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007539 const Py_ssize_t n = PyBytes_Size(*outbytes);
7540 if (outsize > PY_SSIZE_T_MAX - n) {
7541 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007542 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007545 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7546 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007547 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007548 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007550 }
7551
7552 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 outsize = WideCharToMultiByte(code_page, flags,
7554 p, size,
7555 out, outsize,
7556 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007557 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 if (outsize <= 0)
7559 goto error;
7560 if (pusedDefaultChar && *pusedDefaultChar)
7561 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007562 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007563
Victor Stinner3a50e702011-10-18 21:21:00 +02007564error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007565 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007566 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7567 return -2;
7568 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007569 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007570}
7571
Victor Stinner3a50e702011-10-18 21:21:00 +02007572/*
7573 * Encode a Unicode string to a Windows code page into a byte string using a
7574 * error handler.
7575 *
7576 * Returns consumed characters if succeed, or raise a WindowsError and returns
7577 * -1 on other error.
7578 */
7579static int
7580encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007581 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007583{
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007585 Py_ssize_t pos = unicode_offset;
7586 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 /* Ideally, we should get reason from FormatMessage. This is the Windows
7588 2000 English version of the message. */
7589 const char *reason = "invalid character";
7590 /* 4=maximum length of a UTF-8 sequence */
7591 char buffer[4];
7592 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7593 Py_ssize_t outsize;
7594 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 PyObject *errorHandler = NULL;
7596 PyObject *exc = NULL;
7597 PyObject *encoding_obj = NULL;
7598 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007599 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 PyObject *rep;
7601 int ret = -1;
7602
7603 assert(insize > 0);
7604
7605 encoding = code_page_name(code_page, &encoding_obj);
7606 if (encoding == NULL)
7607 return -1;
7608
7609 if (errors == NULL || strcmp(errors, "strict") == 0) {
7610 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7611 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007612 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 if (exc != NULL) {
7614 PyCodec_StrictErrors(exc);
7615 Py_DECREF(exc);
7616 }
7617 Py_XDECREF(encoding_obj);
7618 return -1;
7619 }
7620
7621 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7622 pusedDefaultChar = &usedDefaultChar;
7623 else
7624 pusedDefaultChar = NULL;
7625
7626 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7627 PyErr_NoMemory();
7628 goto error;
7629 }
7630 outsize = insize * Py_ARRAY_LENGTH(buffer);
7631
7632 if (*outbytes == NULL) {
7633 /* Create string object */
7634 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7635 if (*outbytes == NULL)
7636 goto error;
7637 out = PyBytes_AS_STRING(*outbytes);
7638 }
7639 else {
7640 /* Extend string object */
7641 Py_ssize_t n = PyBytes_Size(*outbytes);
7642 if (n > PY_SSIZE_T_MAX - outsize) {
7643 PyErr_NoMemory();
7644 goto error;
7645 }
7646 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7647 goto error;
7648 out = PyBytes_AS_STRING(*outbytes) + n;
7649 }
7650
7651 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007654 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7655 wchar_t chars[2];
7656 int charsize;
7657 if (ch < 0x10000) {
7658 chars[0] = (wchar_t)ch;
7659 charsize = 1;
7660 }
7661 else {
7662 ch -= 0x10000;
7663 chars[0] = 0xd800 + (ch >> 10);
7664 chars[1] = 0xdc00 + (ch & 0x3ff);
7665 charsize = 2;
7666 }
7667
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 buffer, Py_ARRAY_LENGTH(buffer),
7671 NULL, pusedDefaultChar);
7672 if (outsize > 0) {
7673 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7674 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007675 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007676 memcpy(out, buffer, outsize);
7677 out += outsize;
7678 continue;
7679 }
7680 }
7681 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7682 PyErr_SetFromWindowsErr(0);
7683 goto error;
7684 }
7685
Victor Stinner3a50e702011-10-18 21:21:00 +02007686 rep = unicode_encode_call_errorhandler(
7687 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007688 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007689 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 if (rep == NULL)
7691 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007693
7694 if (PyBytes_Check(rep)) {
7695 outsize = PyBytes_GET_SIZE(rep);
7696 if (outsize != 1) {
7697 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7698 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7699 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7700 Py_DECREF(rep);
7701 goto error;
7702 }
7703 out = PyBytes_AS_STRING(*outbytes) + offset;
7704 }
7705 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7706 out += outsize;
7707 }
7708 else {
7709 Py_ssize_t i;
7710 enum PyUnicode_Kind kind;
7711 void *data;
7712
Benjamin Petersonbac79492012-01-14 13:34:47 -05007713 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007714 Py_DECREF(rep);
7715 goto error;
7716 }
7717
7718 outsize = PyUnicode_GET_LENGTH(rep);
7719 if (outsize != 1) {
7720 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7721 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7722 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7723 Py_DECREF(rep);
7724 goto error;
7725 }
7726 out = PyBytes_AS_STRING(*outbytes) + offset;
7727 }
7728 kind = PyUnicode_KIND(rep);
7729 data = PyUnicode_DATA(rep);
7730 for (i=0; i < outsize; i++) {
7731 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7732 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007733 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 encoding, unicode,
7735 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007736 "unable to encode error handler result to ASCII");
7737 Py_DECREF(rep);
7738 goto error;
7739 }
7740 *out = (unsigned char)ch;
7741 out++;
7742 }
7743 }
7744 Py_DECREF(rep);
7745 }
7746 /* write a NUL byte */
7747 *out = 0;
7748 outsize = out - PyBytes_AS_STRING(*outbytes);
7749 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7750 if (_PyBytes_Resize(outbytes, outsize) < 0)
7751 goto error;
7752 ret = 0;
7753
7754error:
7755 Py_XDECREF(encoding_obj);
7756 Py_XDECREF(errorHandler);
7757 Py_XDECREF(exc);
7758 return ret;
7759}
7760
Victor Stinner3a50e702011-10-18 21:21:00 +02007761static PyObject *
7762encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007763 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007764 const char *errors)
7765{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007766 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007767 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007768 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007769 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007770
Benjamin Petersonbac79492012-01-14 13:34:47 -05007771 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007772 return NULL;
7773 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007774
Victor Stinner3a50e702011-10-18 21:21:00 +02007775 if (code_page < 0) {
7776 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7777 return NULL;
7778 }
7779
Martin v. Löwis3d325192011-11-04 18:23:06 +01007780 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007781 return PyBytes_FromStringAndSize(NULL, 0);
7782
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 offset = 0;
7784 do
7785 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007787 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007788 chunks. */
7789 if (len > INT_MAX/2) {
7790 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007791 done = 0;
7792 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007793 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007794#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007795 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007796 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007797 done = 1;
7798 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007799
Victor Stinner76a31a62011-11-04 00:05:13 +01007800 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007801 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007802 errors);
7803 if (ret == -2)
7804 ret = encode_code_page_errors(code_page, &outbytes,
7805 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007806 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007807 if (ret < 0) {
7808 Py_XDECREF(outbytes);
7809 return NULL;
7810 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007811
Victor Stinner7581cef2011-11-03 22:32:33 +01007812 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007813 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007814 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007815
Victor Stinner3a50e702011-10-18 21:21:00 +02007816 return outbytes;
7817}
7818
7819PyObject *
7820PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7821 Py_ssize_t size,
7822 const char *errors)
7823{
Victor Stinner7581cef2011-11-03 22:32:33 +01007824 PyObject *unicode, *res;
7825 unicode = PyUnicode_FromUnicode(p, size);
7826 if (unicode == NULL)
7827 return NULL;
7828 res = encode_code_page(CP_ACP, unicode, errors);
7829 Py_DECREF(unicode);
7830 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007831}
7832
7833PyObject *
7834PyUnicode_EncodeCodePage(int code_page,
7835 PyObject *unicode,
7836 const char *errors)
7837{
Victor Stinner7581cef2011-11-03 22:32:33 +01007838 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007839}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007840
Alexander Belopolsky40018472011-02-26 01:02:56 +00007841PyObject *
7842PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007843{
7844 if (!PyUnicode_Check(unicode)) {
7845 PyErr_BadArgument();
7846 return NULL;
7847 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007848 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007849}
7850
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007851#undef NEED_RETRY
7852
Victor Stinner99b95382011-07-04 14:23:54 +02007853#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007854
Guido van Rossumd57fd912000-03-10 22:53:23 +00007855/* --- Character Mapping Codec -------------------------------------------- */
7856
Alexander Belopolsky40018472011-02-26 01:02:56 +00007857PyObject *
7858PyUnicode_DecodeCharmap(const char *s,
7859 Py_ssize_t size,
7860 PyObject *mapping,
7861 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007864 Py_ssize_t startinpos;
7865 Py_ssize_t endinpos;
7866 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007867 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007868 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007869 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 PyObject *errorHandler = NULL;
7871 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007872
Guido van Rossumd57fd912000-03-10 22:53:23 +00007873 /* Default to Latin-1 */
7874 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007877 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007880 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007881 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007882 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007883 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007884 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007885 Py_ssize_t maplen;
7886 enum PyUnicode_Kind kind;
7887 void *data;
7888 Py_UCS4 x;
7889
Benjamin Petersonbac79492012-01-14 13:34:47 -05007890 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007891 return NULL;
7892
7893 maplen = PyUnicode_GET_LENGTH(mapping);
7894 data = PyUnicode_DATA(mapping);
7895 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007896 while (s < e) {
7897 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007900 x = PyUnicode_READ(kind, data, ch);
7901 else
7902 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007904 if (x == 0xfffe)
7905 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 startinpos = s-starts;
7908 endinpos = startinpos+1;
7909 if (unicode_decode_call_errorhandler(
7910 errors, &errorHandler,
7911 "charmap", "character maps to <undefined>",
7912 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007913 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 goto onError;
7915 }
7916 continue;
7917 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007918
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007919 if (unicode_putchar(&v, &outpos, x) < 0)
7920 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007923 }
7924 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 while (s < e) {
7926 unsigned char ch = *s;
7927 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007928
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7930 w = PyLong_FromLong((long)ch);
7931 if (w == NULL)
7932 goto onError;
7933 x = PyObject_GetItem(mapping, w);
7934 Py_DECREF(w);
7935 if (x == NULL) {
7936 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7937 /* No mapping found means: mapping is undefined. */
7938 PyErr_Clear();
7939 x = Py_None;
7940 Py_INCREF(x);
7941 } else
7942 goto onError;
7943 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007944
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 /* Apply mapping */
7946 if (PyLong_Check(x)) {
7947 long value = PyLong_AS_LONG(x);
7948 if (value < 0 || value > 65535) {
7949 PyErr_SetString(PyExc_TypeError,
7950 "character mapping must be in range(65536)");
7951 Py_DECREF(x);
7952 goto onError;
7953 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007954 if (unicode_putchar(&v, &outpos, value) < 0)
7955 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 }
7957 else if (x == Py_None) {
7958 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007959 startinpos = s-starts;
7960 endinpos = startinpos+1;
7961 if (unicode_decode_call_errorhandler(
7962 errors, &errorHandler,
7963 "charmap", "character maps to <undefined>",
7964 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007965 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 Py_DECREF(x);
7967 goto onError;
7968 }
7969 Py_DECREF(x);
7970 continue;
7971 }
7972 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007973 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007974
Benjamin Petersonbac79492012-01-14 13:34:47 -05007975 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007976 goto onError;
7977 targetsize = PyUnicode_GET_LENGTH(x);
7978
7979 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007981 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007982 PyUnicode_READ_CHAR(x, 0)) < 0)
7983 goto onError;
7984 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 else if (targetsize > 1) {
7986 /* 1-n mapping */
7987 if (targetsize > extrachars) {
7988 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 Py_ssize_t needed = (targetsize - extrachars) + \
7990 (targetsize << 2);
7991 extrachars += needed;
7992 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007993 if (unicode_resize(&v,
7994 PyUnicode_GET_LENGTH(v) + needed) < 0)
7995 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 Py_DECREF(x);
7997 goto onError;
7998 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 }
Victor Stinner1b487b42012-05-03 12:29:04 +02008000 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01008001 goto onError;
8002 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
8003 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 extrachars -= targetsize;
8005 }
8006 /* 1-0 mapping: skip the character */
8007 }
8008 else {
8009 /* wrong return value */
8010 PyErr_SetString(PyExc_TypeError,
8011 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008012 Py_DECREF(x);
8013 goto onError;
8014 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 Py_DECREF(x);
8016 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008018 }
Victor Stinner16e6a802011-12-12 13:24:15 +01008019 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01008020 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008021 Py_XDECREF(errorHandler);
8022 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008023 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00008024
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008026 Py_XDECREF(errorHandler);
8027 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 Py_XDECREF(v);
8029 return NULL;
8030}
8031
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008032/* Charmap encoding: the lookup table */
8033
Alexander Belopolsky40018472011-02-26 01:02:56 +00008034struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 PyObject_HEAD
8036 unsigned char level1[32];
8037 int count2, count3;
8038 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039};
8040
8041static PyObject*
8042encoding_map_size(PyObject *obj, PyObject* args)
8043{
8044 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008045 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047}
8048
8049static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 PyDoc_STR("Return the size (in bytes) of this object") },
8052 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008053};
8054
8055static void
8056encoding_map_dealloc(PyObject* o)
8057{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008058 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008059}
8060
8061static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008062 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 "EncodingMap", /*tp_name*/
8064 sizeof(struct encoding_map), /*tp_basicsize*/
8065 0, /*tp_itemsize*/
8066 /* methods */
8067 encoding_map_dealloc, /*tp_dealloc*/
8068 0, /*tp_print*/
8069 0, /*tp_getattr*/
8070 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008071 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 0, /*tp_repr*/
8073 0, /*tp_as_number*/
8074 0, /*tp_as_sequence*/
8075 0, /*tp_as_mapping*/
8076 0, /*tp_hash*/
8077 0, /*tp_call*/
8078 0, /*tp_str*/
8079 0, /*tp_getattro*/
8080 0, /*tp_setattro*/
8081 0, /*tp_as_buffer*/
8082 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8083 0, /*tp_doc*/
8084 0, /*tp_traverse*/
8085 0, /*tp_clear*/
8086 0, /*tp_richcompare*/
8087 0, /*tp_weaklistoffset*/
8088 0, /*tp_iter*/
8089 0, /*tp_iternext*/
8090 encoding_map_methods, /*tp_methods*/
8091 0, /*tp_members*/
8092 0, /*tp_getset*/
8093 0, /*tp_base*/
8094 0, /*tp_dict*/
8095 0, /*tp_descr_get*/
8096 0, /*tp_descr_set*/
8097 0, /*tp_dictoffset*/
8098 0, /*tp_init*/
8099 0, /*tp_alloc*/
8100 0, /*tp_new*/
8101 0, /*tp_free*/
8102 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008103};
8104
8105PyObject*
8106PyUnicode_BuildEncodingMap(PyObject* string)
8107{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 PyObject *result;
8109 struct encoding_map *mresult;
8110 int i;
8111 int need_dict = 0;
8112 unsigned char level1[32];
8113 unsigned char level2[512];
8114 unsigned char *mlevel1, *mlevel2, *mlevel3;
8115 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 int kind;
8117 void *data;
8118 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 PyErr_BadArgument();
8122 return NULL;
8123 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008124 kind = PyUnicode_KIND(string);
8125 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008126 memset(level1, 0xFF, sizeof level1);
8127 memset(level2, 0xFF, sizeof level2);
8128
8129 /* If there isn't a one-to-one mapping of NULL to \0,
8130 or if there are non-BMP characters, we need to use
8131 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008133 need_dict = 1;
8134 for (i = 1; i < 256; i++) {
8135 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 ch = PyUnicode_READ(kind, data, i);
8137 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 need_dict = 1;
8139 break;
8140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008141 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 /* unmapped character */
8143 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008144 l1 = ch >> 11;
8145 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 if (level1[l1] == 0xFF)
8147 level1[l1] = count2++;
8148 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008149 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 }
8151
8152 if (count2 >= 0xFF || count3 >= 0xFF)
8153 need_dict = 1;
8154
8155 if (need_dict) {
8156 PyObject *result = PyDict_New();
8157 PyObject *key, *value;
8158 if (!result)
8159 return NULL;
8160 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008162 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 if (!key || !value)
8164 goto failed1;
8165 if (PyDict_SetItem(result, key, value) == -1)
8166 goto failed1;
8167 Py_DECREF(key);
8168 Py_DECREF(value);
8169 }
8170 return result;
8171 failed1:
8172 Py_XDECREF(key);
8173 Py_XDECREF(value);
8174 Py_DECREF(result);
8175 return NULL;
8176 }
8177
8178 /* Create a three-level trie */
8179 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8180 16*count2 + 128*count3 - 1);
8181 if (!result)
8182 return PyErr_NoMemory();
8183 PyObject_Init(result, &EncodingMapType);
8184 mresult = (struct encoding_map*)result;
8185 mresult->count2 = count2;
8186 mresult->count3 = count3;
8187 mlevel1 = mresult->level1;
8188 mlevel2 = mresult->level23;
8189 mlevel3 = mresult->level23 + 16*count2;
8190 memcpy(mlevel1, level1, 32);
8191 memset(mlevel2, 0xFF, 16*count2);
8192 memset(mlevel3, 0, 128*count3);
8193 count3 = 0;
8194 for (i = 1; i < 256; i++) {
8195 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 /* unmapped character */
8198 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 o1 = PyUnicode_READ(kind, data, i)>>11;
8200 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201 i2 = 16*mlevel1[o1] + o2;
8202 if (mlevel2[i2] == 0xFF)
8203 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008204 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205 i3 = 128*mlevel2[i2] + o3;
8206 mlevel3[i3] = i;
8207 }
8208 return result;
8209}
8210
8211static int
Victor Stinner22168992011-11-20 17:09:18 +01008212encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213{
8214 struct encoding_map *map = (struct encoding_map*)mapping;
8215 int l1 = c>>11;
8216 int l2 = (c>>7) & 0xF;
8217 int l3 = c & 0x7F;
8218 int i;
8219
Victor Stinner22168992011-11-20 17:09:18 +01008220 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008222 if (c == 0)
8223 return 0;
8224 /* level 1*/
8225 i = map->level1[l1];
8226 if (i == 0xFF) {
8227 return -1;
8228 }
8229 /* level 2*/
8230 i = map->level23[16*i+l2];
8231 if (i == 0xFF) {
8232 return -1;
8233 }
8234 /* level 3 */
8235 i = map->level23[16*map->count2 + 128*i + l3];
8236 if (i == 0) {
8237 return -1;
8238 }
8239 return i;
8240}
8241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242/* Lookup the character ch in the mapping. If the character
8243 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008244 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008245static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008246charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008247{
Christian Heimes217cfd12007-12-02 14:31:20 +00008248 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 PyObject *x;
8250
8251 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 x = PyObject_GetItem(mapping, w);
8254 Py_DECREF(w);
8255 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8257 /* No mapping found means: mapping is undefined. */
8258 PyErr_Clear();
8259 x = Py_None;
8260 Py_INCREF(x);
8261 return x;
8262 } else
8263 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008264 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008265 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008267 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 long value = PyLong_AS_LONG(x);
8269 if (value < 0 || value > 255) {
8270 PyErr_SetString(PyExc_TypeError,
8271 "character mapping must be in range(256)");
8272 Py_DECREF(x);
8273 return NULL;
8274 }
8275 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008276 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008277 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008279 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 /* wrong return value */
8281 PyErr_Format(PyExc_TypeError,
8282 "character mapping must return integer, bytes or None, not %.400s",
8283 x->ob_type->tp_name);
8284 Py_DECREF(x);
8285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 }
8287}
8288
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008289static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008290charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008291{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008292 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8293 /* exponentially overallocate to minimize reallocations */
8294 if (requiredsize < 2*outsize)
8295 requiredsize = 2*outsize;
8296 if (_PyBytes_Resize(outobj, requiredsize))
8297 return -1;
8298 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008299}
8300
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008305 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 space is available. Return a new reference to the object that
8307 was put in the output buffer, or Py_None, if the mapping was undefined
8308 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008309 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008310static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008311charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008312 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314 PyObject *rep;
8315 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008316 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317
Christian Heimes90aa7642007-12-19 02:45:37 +00008318 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008319 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008321 if (res == -1)
8322 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 if (outsize<requiredsize)
8324 if (charmapencode_resize(outobj, outpos, requiredsize))
8325 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 outstart[(*outpos)++] = (char)res;
8328 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 }
8330
8331 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 Py_DECREF(rep);
8336 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 if (PyLong_Check(rep)) {
8339 Py_ssize_t requiredsize = *outpos+1;
8340 if (outsize<requiredsize)
8341 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8342 Py_DECREF(rep);
8343 return enc_EXCEPTION;
8344 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008345 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008347 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 else {
8349 const char *repchars = PyBytes_AS_STRING(rep);
8350 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8351 Py_ssize_t requiredsize = *outpos+repsize;
8352 if (outsize<requiredsize)
8353 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8354 Py_DECREF(rep);
8355 return enc_EXCEPTION;
8356 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008357 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008358 memcpy(outstart + *outpos, repchars, repsize);
8359 *outpos += repsize;
8360 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008362 Py_DECREF(rep);
8363 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364}
8365
8366/* handle an error in PyUnicode_EncodeCharmap
8367 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008368static int
8369charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008370 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008372 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008373 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374{
8375 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008376 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008377 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008378 enum PyUnicode_Kind kind;
8379 void *data;
8380 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008382 Py_ssize_t collstartpos = *inpos;
8383 Py_ssize_t collendpos = *inpos+1;
8384 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 char *encoding = "charmap";
8386 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008387 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008389 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390
Benjamin Petersonbac79492012-01-14 13:34:47 -05008391 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008392 return -1;
8393 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394 /* find all unencodable characters */
8395 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008396 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008397 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008398 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008399 val = encoding_map_lookup(ch, mapping);
8400 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 break;
8402 ++collendpos;
8403 continue;
8404 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008406 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8407 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 if (rep==NULL)
8409 return -1;
8410 else if (rep!=Py_None) {
8411 Py_DECREF(rep);
8412 break;
8413 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008414 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 }
8417 /* cache callback name lookup
8418 * (if not done yet, i.e. it's the first error) */
8419 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 if ((errors==NULL) || (!strcmp(errors, "strict")))
8421 *known_errorHandler = 1;
8422 else if (!strcmp(errors, "replace"))
8423 *known_errorHandler = 2;
8424 else if (!strcmp(errors, "ignore"))
8425 *known_errorHandler = 3;
8426 else if (!strcmp(errors, "xmlcharrefreplace"))
8427 *known_errorHandler = 4;
8428 else
8429 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008433 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008434 return -1;
8435 case 2: /* replace */
8436 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 x = charmapencode_output('?', mapping, res, respos);
8438 if (x==enc_EXCEPTION) {
8439 return -1;
8440 }
8441 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008442 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 return -1;
8444 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008445 }
8446 /* fall through */
8447 case 3: /* ignore */
8448 *inpos = collendpos;
8449 break;
8450 case 4: /* xmlcharrefreplace */
8451 /* generate replacement (temporarily (mis)uses p) */
8452 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 char buffer[2+29+1+1];
8454 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008455 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 for (cp = buffer; *cp; ++cp) {
8457 x = charmapencode_output(*cp, mapping, res, respos);
8458 if (x==enc_EXCEPTION)
8459 return -1;
8460 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008461 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 return -1;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 }
8465 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008466 *inpos = collendpos;
8467 break;
8468 default:
8469 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008470 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008474 if (PyBytes_Check(repunicode)) {
8475 /* Directly copy bytes result to output. */
8476 Py_ssize_t outsize = PyBytes_Size(*res);
8477 Py_ssize_t requiredsize;
8478 repsize = PyBytes_Size(repunicode);
8479 requiredsize = *respos + repsize;
8480 if (requiredsize > outsize)
8481 /* Make room for all additional bytes. */
8482 if (charmapencode_resize(res, respos, requiredsize)) {
8483 Py_DECREF(repunicode);
8484 return -1;
8485 }
8486 memcpy(PyBytes_AsString(*res) + *respos,
8487 PyBytes_AsString(repunicode), repsize);
8488 *respos += repsize;
8489 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008490 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008491 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008492 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008493 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008494 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008495 Py_DECREF(repunicode);
8496 return -1;
8497 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008498 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008499 data = PyUnicode_DATA(repunicode);
8500 kind = PyUnicode_KIND(repunicode);
8501 for (index = 0; index < repsize; index++) {
8502 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8503 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008505 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
8507 }
8508 else if (x==enc_FAILED) {
8509 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008510 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return -1;
8512 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 }
8514 *inpos = newpos;
8515 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008516 }
8517 return 0;
8518}
8519
Alexander Belopolsky40018472011-02-26 01:02:56 +00008520PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008521_PyUnicode_EncodeCharmap(PyObject *unicode,
8522 PyObject *mapping,
8523 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 /* output object */
8526 PyObject *res = NULL;
8527 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008528 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008529 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008531 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008532 PyObject *errorHandler = NULL;
8533 PyObject *exc = NULL;
8534 /* the following variable is used for caching string comparisons
8535 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8536 * 3=ignore, 4=xmlcharrefreplace */
8537 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008538
Benjamin Petersonbac79492012-01-14 13:34:47 -05008539 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540 return NULL;
8541 size = PyUnicode_GET_LENGTH(unicode);
8542
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 /* Default to Latin-1 */
8544 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008545 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547 /* allocate enough for a simple encoding without
8548 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008549 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 if (res == NULL)
8551 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008552 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008556 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 if (x==enc_EXCEPTION) /* error */
8560 goto onError;
8561 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008562 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 &exc,
8564 &known_errorHandler, &errorHandler, errors,
8565 &res, &respos)) {
8566 goto onError;
8567 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 else
8570 /* done with this character => adjust input position */
8571 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008573
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008575 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008576 if (_PyBytes_Resize(&res, respos) < 0)
8577 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008578
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 Py_XDECREF(exc);
8580 Py_XDECREF(errorHandler);
8581 return res;
8582
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 Py_XDECREF(res);
8585 Py_XDECREF(exc);
8586 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 return NULL;
8588}
8589
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590/* Deprecated */
8591PyObject *
8592PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8593 Py_ssize_t size,
8594 PyObject *mapping,
8595 const char *errors)
8596{
8597 PyObject *result;
8598 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8599 if (unicode == NULL)
8600 return NULL;
8601 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8602 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008603 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008604}
8605
Alexander Belopolsky40018472011-02-26 01:02:56 +00008606PyObject *
8607PyUnicode_AsCharmapString(PyObject *unicode,
8608 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609{
8610 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 PyErr_BadArgument();
8612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615}
8616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008618static void
8619make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008621 Py_ssize_t startpos, Py_ssize_t endpos,
8622 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 *exceptionObject = _PyUnicodeTranslateError_Create(
8626 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008627 }
8628 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8630 goto onError;
8631 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8632 goto onError;
8633 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8634 goto onError;
8635 return;
8636 onError:
8637 Py_DECREF(*exceptionObject);
8638 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 }
8640}
8641
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643static void
8644raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008646 Py_ssize_t startpos, Py_ssize_t endpos,
8647 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008648{
8649 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653}
8654
8655/* error handling callback helper:
8656 build arguments, call the callback and check the arguments,
8657 put the result into newpos and return the replacement string, which
8658 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659static PyObject *
8660unicode_translate_call_errorhandler(const char *errors,
8661 PyObject **errorHandler,
8662 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008664 Py_ssize_t startpos, Py_ssize_t endpos,
8665 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008667 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008669 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 PyObject *restuple;
8671 PyObject *resunicode;
8672
8673 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 }
8678
8679 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683
8684 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008685 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008689 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 Py_DECREF(restuple);
8691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 }
8693 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 &resunicode, &i_newpos)) {
8695 Py_DECREF(restuple);
8696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008698 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008699 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008700 else
8701 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8704 Py_DECREF(restuple);
8705 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 Py_INCREF(resunicode);
8708 Py_DECREF(restuple);
8709 return resunicode;
8710}
8711
8712/* Lookup the character ch in the mapping and put the result in result,
8713 which must be decrefed by the caller.
8714 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008715static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717{
Christian Heimes217cfd12007-12-02 14:31:20 +00008718 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 PyObject *x;
8720
8721 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 x = PyObject_GetItem(mapping, w);
8724 Py_DECREF(w);
8725 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8727 /* No mapping found means: use 1:1 mapping. */
8728 PyErr_Clear();
8729 *result = NULL;
8730 return 0;
8731 } else
8732 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 }
8734 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 *result = x;
8736 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008738 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 long value = PyLong_AS_LONG(x);
8740 long max = PyUnicode_GetMax();
8741 if (value < 0 || value > max) {
8742 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008743 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 Py_DECREF(x);
8745 return -1;
8746 }
8747 *result = x;
8748 return 0;
8749 }
8750 else if (PyUnicode_Check(x)) {
8751 *result = x;
8752 return 0;
8753 }
8754 else {
8755 /* wrong return value */
8756 PyErr_SetString(PyExc_TypeError,
8757 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008758 Py_DECREF(x);
8759 return -1;
8760 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761}
8762/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 if not reallocate and adjust various state variables.
8764 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008765static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008766charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008769 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008770 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 /* exponentially overallocate to minimize reallocations */
8772 if (requiredsize < 2 * oldsize)
8773 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8775 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008778 }
8779 return 0;
8780}
8781/* lookup the character, put the result in the output string and adjust
8782 various state variables. Return a new reference to the object that
8783 was put in the output buffer in *result, or Py_None, if the mapping was
8784 undefined (in which case no character was written).
8785 The called must decref result.
8786 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008787static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8789 PyObject *mapping, Py_UCS4 **output,
8790 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008791 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8794 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008799 }
8800 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008802 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008804 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 }
8806 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008807 Py_ssize_t repsize;
8808 if (PyUnicode_READY(*res) == -1)
8809 return -1;
8810 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 if (repsize==1) {
8812 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 }
8815 else if (repsize!=0) {
8816 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817 Py_ssize_t requiredsize = *opos +
8818 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 Py_ssize_t i;
8821 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 for(i = 0; i < repsize; i++)
8824 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008826 }
8827 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008828 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 return 0;
8830}
8831
Alexander Belopolsky40018472011-02-26 01:02:56 +00008832PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833_PyUnicode_TranslateCharmap(PyObject *input,
8834 PyObject *mapping,
8835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 /* input object */
8838 char *idata;
8839 Py_ssize_t size, i;
8840 int kind;
8841 /* output buffer */
8842 Py_UCS4 *output = NULL;
8843 Py_ssize_t osize;
8844 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008845 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847 char *reason = "character maps to <undefined>";
8848 PyObject *errorHandler = NULL;
8849 PyObject *exc = NULL;
8850 /* the following variable is used for caching string comparisons
8851 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8852 * 3=ignore, 4=xmlcharrefreplace */
8853 int known_errorHandler = -1;
8854
Guido van Rossumd57fd912000-03-10 22:53:23 +00008855 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 PyErr_BadArgument();
8857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008858 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008860 if (PyUnicode_READY(input) == -1)
8861 return NULL;
8862 idata = (char*)PyUnicode_DATA(input);
8863 kind = PyUnicode_KIND(input);
8864 size = PyUnicode_GET_LENGTH(input);
8865 i = 0;
8866
8867 if (size == 0) {
8868 Py_INCREF(input);
8869 return input;
8870 }
8871
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008872 /* allocate enough for a simple 1:1 translation without
8873 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 osize = size;
8875 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8876 opos = 0;
8877 if (output == NULL) {
8878 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 /* try to encode it */
8884 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 if (charmaptranslate_output(input, i, mapping,
8886 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 Py_XDECREF(x);
8888 goto onError;
8889 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008890 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 else { /* untranslatable character */
8894 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8895 Py_ssize_t repsize;
8896 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008898 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 Py_ssize_t collstart = i;
8900 Py_ssize_t collend = i+1;
8901 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902
Benjamin Peterson29060642009-01-31 22:14:21 +00008903 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008904 while (collend < size) {
8905 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 goto onError;
8907 Py_XDECREF(x);
8908 if (x!=Py_None)
8909 break;
8910 ++collend;
8911 }
8912 /* cache callback name lookup
8913 * (if not done yet, i.e. it's the first error) */
8914 if (known_errorHandler==-1) {
8915 if ((errors==NULL) || (!strcmp(errors, "strict")))
8916 known_errorHandler = 1;
8917 else if (!strcmp(errors, "replace"))
8918 known_errorHandler = 2;
8919 else if (!strcmp(errors, "ignore"))
8920 known_errorHandler = 3;
8921 else if (!strcmp(errors, "xmlcharrefreplace"))
8922 known_errorHandler = 4;
8923 else
8924 known_errorHandler = 0;
8925 }
8926 switch (known_errorHandler) {
8927 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 raise_translate_exception(&exc, input, collstart,
8929 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008930 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 case 2: /* replace */
8932 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 for (coll = collstart; coll<collend; coll++)
8934 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 /* fall through */
8936 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 break;
8939 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 /* generate replacement (temporarily (mis)uses i) */
8941 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 char buffer[2+29+1+1];
8943 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008944 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8945 if (charmaptranslate_makespace(&output, &osize,
8946 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 goto onError;
8948 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008952 break;
8953 default:
8954 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 reason, input, &exc,
8956 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008957 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008958 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008959 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008960 Py_DECREF(repunicode);
8961 goto onError;
8962 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 repsize = PyUnicode_GET_LENGTH(repunicode);
8965 if (charmaptranslate_makespace(&output, &osize,
8966 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 Py_DECREF(repunicode);
8968 goto onError;
8969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 for (uni2 = 0; repsize-->0; ++uni2)
8971 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8972 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008975 }
8976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8978 if (!res)
8979 goto onError;
8980 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981 Py_XDECREF(exc);
8982 Py_XDECREF(errorHandler);
8983 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984
Benjamin Peterson29060642009-01-31 22:14:21 +00008985 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008986 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008987 Py_XDECREF(exc);
8988 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989 return NULL;
8990}
8991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992/* Deprecated. Use PyUnicode_Translate instead. */
8993PyObject *
8994PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8995 Py_ssize_t size,
8996 PyObject *mapping,
8997 const char *errors)
8998{
8999 PyObject *unicode = PyUnicode_FromUnicode(p, size);
9000 if (!unicode)
9001 return NULL;
9002 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9003}
9004
Alexander Belopolsky40018472011-02-26 01:02:56 +00009005PyObject *
9006PyUnicode_Translate(PyObject *str,
9007 PyObject *mapping,
9008 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009{
9010 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00009011
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 str = PyUnicode_FromObject(str);
9013 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 Py_DECREF(str);
9017 return result;
Tim Petersced69f82003-09-16 20:30:58 +00009018
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020 Py_XDECREF(str);
9021 return NULL;
9022}
Tim Petersced69f82003-09-16 20:30:58 +00009023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009025fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026{
9027 /* No need to call PyUnicode_READY(self) because this function is only
9028 called as a callback from fixup() which does it already. */
9029 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9030 const int kind = PyUnicode_KIND(self);
9031 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009032 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009033 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 Py_ssize_t i;
9035
9036 for (i = 0; i < len; ++i) {
9037 ch = PyUnicode_READ(kind, data, i);
9038 fixed = 0;
9039 if (ch > 127) {
9040 if (Py_UNICODE_ISSPACE(ch))
9041 fixed = ' ';
9042 else {
9043 const int decimal = Py_UNICODE_TODECIMAL(ch);
9044 if (decimal >= 0)
9045 fixed = '0' + decimal;
9046 }
9047 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009048 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02009049 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 PyUnicode_WRITE(kind, data, i, fixed);
9051 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009052 else
9053 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 }
9056
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009057 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058}
9059
9060PyObject *
9061_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9062{
9063 if (!PyUnicode_Check(unicode)) {
9064 PyErr_BadInternalCall();
9065 return NULL;
9066 }
9067 if (PyUnicode_READY(unicode) == -1)
9068 return NULL;
9069 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9070 /* If the string is already ASCII, just return the same string */
9071 Py_INCREF(unicode);
9072 return unicode;
9073 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009074 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075}
9076
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009077PyObject *
9078PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9079 Py_ssize_t length)
9080{
Victor Stinnerf0124502011-11-21 23:12:56 +01009081 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009082 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009083 Py_UCS4 maxchar;
9084 enum PyUnicode_Kind kind;
9085 void *data;
9086
Victor Stinner99d7ad02012-02-22 13:37:39 +01009087 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009088 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009089 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009090 if (ch > 127) {
9091 int decimal = Py_UNICODE_TODECIMAL(ch);
9092 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009093 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02009094 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009095 }
9096 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009097
9098 /* Copy to a new string */
9099 decimal = PyUnicode_New(length, maxchar);
9100 if (decimal == NULL)
9101 return decimal;
9102 kind = PyUnicode_KIND(decimal);
9103 data = PyUnicode_DATA(decimal);
9104 /* Iterate over code points */
9105 for (i = 0; i < length; i++) {
9106 Py_UNICODE ch = s[i];
9107 if (ch > 127) {
9108 int decimal = Py_UNICODE_TODECIMAL(ch);
9109 if (decimal >= 0)
9110 ch = '0' + decimal;
9111 }
9112 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009114 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009115}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009116/* --- Decimal Encoder ---------------------------------------------------- */
9117
Alexander Belopolsky40018472011-02-26 01:02:56 +00009118int
9119PyUnicode_EncodeDecimal(Py_UNICODE *s,
9120 Py_ssize_t length,
9121 char *output,
9122 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009123{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009124 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009125 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009126 enum PyUnicode_Kind kind;
9127 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009128
9129 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 PyErr_BadArgument();
9131 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009132 }
9133
Victor Stinner42bf7752011-11-21 22:52:58 +01009134 unicode = PyUnicode_FromUnicode(s, length);
9135 if (unicode == NULL)
9136 return -1;
9137
Benjamin Petersonbac79492012-01-14 13:34:47 -05009138 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009139 Py_DECREF(unicode);
9140 return -1;
9141 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009142 kind = PyUnicode_KIND(unicode);
9143 data = PyUnicode_DATA(unicode);
9144
Victor Stinnerb84d7232011-11-22 01:50:07 +01009145 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009146 PyObject *exc;
9147 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009149 Py_ssize_t startpos;
9150
9151 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009152
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009154 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009155 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009157 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 decimal = Py_UNICODE_TODECIMAL(ch);
9159 if (decimal >= 0) {
9160 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009161 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 continue;
9163 }
9164 if (0 < ch && ch < 256) {
9165 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009166 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 continue;
9168 }
Victor Stinner6345be92011-11-25 20:09:01 +01009169
Victor Stinner42bf7752011-11-21 22:52:58 +01009170 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009171 exc = NULL;
9172 raise_encode_exception(&exc, "decimal", unicode,
9173 startpos, startpos+1,
9174 "invalid decimal Unicode string");
9175 Py_XDECREF(exc);
9176 Py_DECREF(unicode);
9177 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009178 }
9179 /* 0-terminate the output string */
9180 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009181 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009182 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009183}
9184
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185/* --- Helpers ------------------------------------------------------------ */
9186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009188any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 Py_ssize_t start,
9190 Py_ssize_t end)
9191{
9192 int kind1, kind2, kind;
9193 void *buf1, *buf2;
9194 Py_ssize_t len1, len2, result;
9195
9196 kind1 = PyUnicode_KIND(s1);
9197 kind2 = PyUnicode_KIND(s2);
9198 kind = kind1 > kind2 ? kind1 : kind2;
9199 buf1 = PyUnicode_DATA(s1);
9200 buf2 = PyUnicode_DATA(s2);
9201 if (kind1 != kind)
9202 buf1 = _PyUnicode_AsKind(s1, kind);
9203 if (!buf1)
9204 return -2;
9205 if (kind2 != kind)
9206 buf2 = _PyUnicode_AsKind(s2, kind);
9207 if (!buf2) {
9208 if (kind1 != kind) PyMem_Free(buf1);
9209 return -2;
9210 }
9211 len1 = PyUnicode_GET_LENGTH(s1);
9212 len2 = PyUnicode_GET_LENGTH(s2);
9213
Victor Stinner794d5672011-10-10 03:21:36 +02009214 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009215 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009216 case PyUnicode_1BYTE_KIND:
9217 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9218 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9219 else
9220 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9221 break;
9222 case PyUnicode_2BYTE_KIND:
9223 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9224 break;
9225 case PyUnicode_4BYTE_KIND:
9226 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9227 break;
9228 default:
9229 assert(0); result = -2;
9230 }
9231 }
9232 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009233 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009234 case PyUnicode_1BYTE_KIND:
9235 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9236 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9237 else
9238 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9239 break;
9240 case PyUnicode_2BYTE_KIND:
9241 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9242 break;
9243 case PyUnicode_4BYTE_KIND:
9244 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9245 break;
9246 default:
9247 assert(0); result = -2;
9248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 }
9250
9251 if (kind1 != kind)
9252 PyMem_Free(buf1);
9253 if (kind2 != kind)
9254 PyMem_Free(buf2);
9255
9256 return result;
9257}
9258
9259Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009260_PyUnicode_InsertThousandsGrouping(
9261 PyObject *unicode, Py_ssize_t index,
9262 Py_ssize_t n_buffer,
9263 void *digits, Py_ssize_t n_digits,
9264 Py_ssize_t min_width,
9265 const char *grouping, PyObject *thousands_sep,
9266 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267{
Victor Stinner41a863c2012-02-24 00:37:51 +01009268 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009269 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009270 Py_ssize_t thousands_sep_len;
9271 Py_ssize_t len;
9272
9273 if (unicode != NULL) {
9274 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009275 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009276 }
9277 else {
9278 kind = PyUnicode_1BYTE_KIND;
9279 data = NULL;
9280 }
9281 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9282 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9283 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9284 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009285 if (thousands_sep_kind < kind) {
9286 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9287 if (!thousands_sep_data)
9288 return -1;
9289 }
9290 else {
9291 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9292 if (!data)
9293 return -1;
9294 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009295 }
9296
Benjamin Petersonead6b532011-12-20 17:23:42 -06009297 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009299 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009300 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009301 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009302 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009303 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009304 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009305 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009306 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009307 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009308 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009309 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009311 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009312 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009313 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009314 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009315 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009317 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009318 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009319 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009320 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009321 break;
9322 default:
9323 assert(0);
9324 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009326 if (unicode != NULL && thousands_sep_kind != kind) {
9327 if (thousands_sep_kind < kind)
9328 PyMem_Free(thousands_sep_data);
9329 else
9330 PyMem_Free(data);
9331 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009332 if (unicode == NULL) {
9333 *maxchar = 127;
9334 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009335 *maxchar = MAX_MAXCHAR(*maxchar,
9336 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009337 }
9338 }
9339 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340}
9341
9342
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the sequence
   length and may be evaluated more than once.

     - end greater than len is clamped to len;
     - a negative end has len added and is then clamped to 0;
     - a negative start has len added and is then clamped to 0.

   A start greater than len is left untouched, so callers must still cope
   with start > end.

   The body is wrapped in do { } while (0) and every argument is
   parenthesized so the macro behaves as a single statement (safe as the arm
   of an unbraced if/else) and accepts compound expressions.  Invoke it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009357
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358Py_ssize_t
9359PyUnicode_Count(PyObject *str,
9360 PyObject *substr,
9361 Py_ssize_t start,
9362 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009364 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009365 PyObject* str_obj;
9366 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 int kind1, kind2, kind;
9368 void *buf1 = NULL, *buf2 = NULL;
9369 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009370
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009371 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009372 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009374 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009375 if (!sub_obj) {
9376 Py_DECREF(str_obj);
9377 return -1;
9378 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009379 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009380 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009381 Py_DECREF(str_obj);
9382 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383 }
Tim Petersced69f82003-09-16 20:30:58 +00009384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 kind1 = PyUnicode_KIND(str_obj);
9386 kind2 = PyUnicode_KIND(sub_obj);
9387 kind = kind1 > kind2 ? kind1 : kind2;
9388 buf1 = PyUnicode_DATA(str_obj);
9389 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009390 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 if (!buf1)
9392 goto onError;
9393 buf2 = PyUnicode_DATA(sub_obj);
9394 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009395 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396 if (!buf2)
9397 goto onError;
9398 len1 = PyUnicode_GET_LENGTH(str_obj);
9399 len2 = PyUnicode_GET_LENGTH(sub_obj);
9400
9401 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009402 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009404 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9405 result = asciilib_count(
9406 ((Py_UCS1*)buf1) + start, end - start,
9407 buf2, len2, PY_SSIZE_T_MAX
9408 );
9409 else
9410 result = ucs1lib_count(
9411 ((Py_UCS1*)buf1) + start, end - start,
9412 buf2, len2, PY_SSIZE_T_MAX
9413 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009414 break;
9415 case PyUnicode_2BYTE_KIND:
9416 result = ucs2lib_count(
9417 ((Py_UCS2*)buf1) + start, end - start,
9418 buf2, len2, PY_SSIZE_T_MAX
9419 );
9420 break;
9421 case PyUnicode_4BYTE_KIND:
9422 result = ucs4lib_count(
9423 ((Py_UCS4*)buf1) + start, end - start,
9424 buf2, len2, PY_SSIZE_T_MAX
9425 );
9426 break;
9427 default:
9428 assert(0); result = 0;
9429 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009430
9431 Py_DECREF(sub_obj);
9432 Py_DECREF(str_obj);
9433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 if (kind1 != kind)
9435 PyMem_Free(buf1);
9436 if (kind2 != kind)
9437 PyMem_Free(buf2);
9438
Guido van Rossumd57fd912000-03-10 22:53:23 +00009439 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 onError:
9441 Py_DECREF(sub_obj);
9442 Py_DECREF(str_obj);
9443 if (kind1 != kind && buf1)
9444 PyMem_Free(buf1);
9445 if (kind2 != kind && buf2)
9446 PyMem_Free(buf2);
9447 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448}
9449
Alexander Belopolsky40018472011-02-26 01:02:56 +00009450Py_ssize_t
9451PyUnicode_Find(PyObject *str,
9452 PyObject *sub,
9453 Py_ssize_t start,
9454 Py_ssize_t end,
9455 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009457 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009458
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009460 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009462 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009463 if (!sub) {
9464 Py_DECREF(str);
9465 return -2;
9466 }
9467 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9468 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 Py_DECREF(str);
9470 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471 }
Tim Petersced69f82003-09-16 20:30:58 +00009472
Victor Stinner794d5672011-10-10 03:21:36 +02009473 result = any_find_slice(direction,
9474 str, sub, start, end
9475 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009476
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009478 Py_DECREF(sub);
9479
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 return result;
9481}
9482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009483Py_ssize_t
9484PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9485 Py_ssize_t start, Py_ssize_t end,
9486 int direction)
9487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009489 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009490 if (PyUnicode_READY(str) == -1)
9491 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009492 if (start < 0 || end < 0) {
9493 PyErr_SetString(PyExc_IndexError, "string index out of range");
9494 return -2;
9495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 if (end > PyUnicode_GET_LENGTH(str))
9497 end = PyUnicode_GET_LENGTH(str);
9498 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009499 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9500 kind, end-start, ch, direction);
9501 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009502 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009503 else
9504 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009505}
9506
Alexander Belopolsky40018472011-02-26 01:02:56 +00009507static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009508tailmatch(PyObject *self,
9509 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009510 Py_ssize_t start,
9511 Py_ssize_t end,
9512 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 int kind_self;
9515 int kind_sub;
9516 void *data_self;
9517 void *data_sub;
9518 Py_ssize_t offset;
9519 Py_ssize_t i;
9520 Py_ssize_t end_sub;
9521
9522 if (PyUnicode_READY(self) == -1 ||
9523 PyUnicode_READY(substring) == -1)
9524 return 0;
9525
9526 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 return 1;
9528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009529 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9530 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009531 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009532 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 kind_self = PyUnicode_KIND(self);
9535 data_self = PyUnicode_DATA(self);
9536 kind_sub = PyUnicode_KIND(substring);
9537 data_sub = PyUnicode_DATA(substring);
9538 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9539
9540 if (direction > 0)
9541 offset = end;
9542 else
9543 offset = start;
9544
9545 if (PyUnicode_READ(kind_self, data_self, offset) ==
9546 PyUnicode_READ(kind_sub, data_sub, 0) &&
9547 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9548 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9549 /* If both are of the same kind, memcmp is sufficient */
9550 if (kind_self == kind_sub) {
9551 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009552 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009553 data_sub,
9554 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009555 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 }
9557 /* otherwise we have to compare each character by first accesing it */
9558 else {
9559 /* We do not need to compare 0 and len(substring)-1 because
9560 the if statement above ensured already that they are equal
9561 when we end up here. */
9562 // TODO: honor direction and do a forward or backwards search
9563 for (i = 1; i < end_sub; ++i) {
9564 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9565 PyUnicode_READ(kind_sub, data_sub, i))
9566 return 0;
9567 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 }
9571
9572 return 0;
9573}
9574
Alexander Belopolsky40018472011-02-26 01:02:56 +00009575Py_ssize_t
9576PyUnicode_Tailmatch(PyObject *str,
9577 PyObject *substr,
9578 Py_ssize_t start,
9579 Py_ssize_t end,
9580 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009582 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009583
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584 str = PyUnicode_FromObject(str);
9585 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 substr = PyUnicode_FromObject(substr);
9588 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 Py_DECREF(str);
9590 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591 }
Tim Petersced69f82003-09-16 20:30:58 +00009592
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009593 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009595 Py_DECREF(str);
9596 Py_DECREF(substr);
9597 return result;
9598}
9599
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600/* Apply fixfct filter to the Unicode object self and return a
9601 reference to the modified object */
9602
Alexander Belopolsky40018472011-02-26 01:02:56 +00009603static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009604fixup(PyObject *self,
9605 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 PyObject *u;
9608 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009609 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009611 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009613 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009614 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 /* fix functions return the new maximum character in a string,
9617 if the kind of the resulting unicode object does not change,
9618 everything is fine. Otherwise we need to change the string kind
9619 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009620 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009621
9622 if (maxchar_new == 0) {
9623 /* no changes */;
9624 if (PyUnicode_CheckExact(self)) {
9625 Py_DECREF(u);
9626 Py_INCREF(self);
9627 return self;
9628 }
9629 else
9630 return u;
9631 }
9632
Victor Stinnere6abb482012-05-02 01:15:40 +02009633 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634
Victor Stinnereaab6042011-12-11 22:22:39 +01009635 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009637
9638 /* In case the maximum character changed, we need to
9639 convert the string to the new category. */
9640 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9641 if (v == NULL) {
9642 Py_DECREF(u);
9643 return NULL;
9644 }
9645 if (maxchar_new > maxchar_old) {
9646 /* If the maxchar increased so that the kind changed, not all
9647 characters are representable anymore and we need to fix the
9648 string again. This only happens in very few cases. */
9649 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9650 maxchar_old = fixfct(v);
9651 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 }
9653 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009654 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009656 Py_DECREF(u);
9657 assert(_PyUnicode_CheckConsistency(v, 1));
9658 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659}
9660
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009661static PyObject *
9662ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009663{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9665 char *resdata, *data = PyUnicode_DATA(self);
9666 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009667
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009668 res = PyUnicode_New(len, 127);
9669 if (res == NULL)
9670 return NULL;
9671 resdata = PyUnicode_DATA(res);
9672 if (lower)
9673 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009675 _Py_bytes_upper(resdata, data, len);
9676 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009680handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682 Py_ssize_t j;
9683 int final_sigma;
9684 Py_UCS4 c;
9685 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009686
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009687 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9688
9689 where ! is a negation and \p{xxx} is a character with property xxx.
9690 */
9691 for (j = i - 1; j >= 0; j--) {
9692 c = PyUnicode_READ(kind, data, j);
9693 if (!_PyUnicode_IsCaseIgnorable(c))
9694 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9697 if (final_sigma) {
9698 for (j = i + 1; j < length; j++) {
9699 c = PyUnicode_READ(kind, data, j);
9700 if (!_PyUnicode_IsCaseIgnorable(c))
9701 break;
9702 }
9703 final_sigma = j == length || !_PyUnicode_IsCased(c);
9704 }
9705 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706}
9707
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708static int
9709lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9710 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712 /* Obscure special case. */
9713 if (c == 0x3A3) {
9714 mapped[0] = handle_capital_sigma(kind, data, length, i);
9715 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009716 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718}
9719
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720static Py_ssize_t
9721do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009723 Py_ssize_t i, k = 0;
9724 int n_res, j;
9725 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009726
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 c = PyUnicode_READ(kind, data, 0);
9728 n_res = _PyUnicode_ToUpperFull(c, mapped);
9729 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009730 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009731 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009733 for (i = 1; i < length; i++) {
9734 c = PyUnicode_READ(kind, data, i);
9735 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9736 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009737 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009739 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009740 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742}
9743
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744static Py_ssize_t
9745do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9746 Py_ssize_t i, k = 0;
9747
9748 for (i = 0; i < length; i++) {
9749 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9750 int n_res, j;
9751 if (Py_UNICODE_ISUPPER(c)) {
9752 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9753 }
9754 else if (Py_UNICODE_ISLOWER(c)) {
9755 n_res = _PyUnicode_ToUpperFull(c, mapped);
9756 }
9757 else {
9758 n_res = 1;
9759 mapped[0] = c;
9760 }
9761 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009762 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 res[k++] = mapped[j];
9764 }
9765 }
9766 return k;
9767}
9768
9769static Py_ssize_t
9770do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9771 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009772{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773 Py_ssize_t i, k = 0;
9774
9775 for (i = 0; i < length; i++) {
9776 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9777 int n_res, j;
9778 if (lower)
9779 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9780 else
9781 n_res = _PyUnicode_ToUpperFull(c, mapped);
9782 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009783 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009784 res[k++] = mapped[j];
9785 }
9786 }
9787 return k;
9788}
9789
9790static Py_ssize_t
9791do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9792{
9793 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9794}
9795
9796static Py_ssize_t
9797do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9798{
9799 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9800}
9801
Benjamin Petersone51757f2012-01-12 21:10:29 -05009802static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009803do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9804{
9805 Py_ssize_t i, k = 0;
9806
9807 for (i = 0; i < length; i++) {
9808 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9809 Py_UCS4 mapped[3];
9810 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9811 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009812 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009813 res[k++] = mapped[j];
9814 }
9815 }
9816 return k;
9817}
9818
9819static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009820do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9821{
9822 Py_ssize_t i, k = 0;
9823 int previous_is_cased;
9824
9825 previous_is_cased = 0;
9826 for (i = 0; i < length; i++) {
9827 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9828 Py_UCS4 mapped[3];
9829 int n_res, j;
9830
9831 if (previous_is_cased)
9832 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9833 else
9834 n_res = _PyUnicode_ToTitleFull(c, mapped);
9835
9836 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009837 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009838 res[k++] = mapped[j];
9839 }
9840
9841 previous_is_cased = _PyUnicode_IsCased(c);
9842 }
9843 return k;
9844}
9845
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009846static PyObject *
9847case_operation(PyObject *self,
9848 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9849{
9850 PyObject *res = NULL;
9851 Py_ssize_t length, newlength = 0;
9852 int kind, outkind;
9853 void *data, *outdata;
9854 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9855
Benjamin Petersoneea48462012-01-16 14:28:50 -05009856 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009857
9858 kind = PyUnicode_KIND(self);
9859 data = PyUnicode_DATA(self);
9860 length = PyUnicode_GET_LENGTH(self);
9861 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9862 if (tmp == NULL)
9863 return PyErr_NoMemory();
9864 newlength = perform(kind, data, length, tmp, &maxchar);
9865 res = PyUnicode_New(newlength, maxchar);
9866 if (res == NULL)
9867 goto leave;
9868 tmpend = tmp + newlength;
9869 outdata = PyUnicode_DATA(res);
9870 outkind = PyUnicode_KIND(res);
9871 switch (outkind) {
9872 case PyUnicode_1BYTE_KIND:
9873 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9874 break;
9875 case PyUnicode_2BYTE_KIND:
9876 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9877 break;
9878 case PyUnicode_4BYTE_KIND:
9879 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9880 break;
9881 default:
9882 assert(0);
9883 break;
9884 }
9885 leave:
9886 PyMem_FREE(tmp);
9887 return res;
9888}
9889
Tim Peters8ce9f162004-08-27 01:49:32 +00009890PyObject *
9891PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009893 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009894 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009896 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009897 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9898 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009899 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009901 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009903 int use_memcpy;
9904 unsigned char *res_data = NULL, *sep_data = NULL;
9905 PyObject *last_obj;
9906 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Tim Peters05eba1f2004-08-27 21:32:02 +00009908 fseq = PySequence_Fast(seq, "");
9909 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009910 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009911 }
9912
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009913 /* NOTE: the following code can't call back into Python code,
9914 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009915 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009916
Tim Peters05eba1f2004-08-27 21:32:02 +00009917 seqlen = PySequence_Fast_GET_SIZE(fseq);
9918 /* If empty sequence, return u"". */
9919 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009920 Py_DECREF(fseq);
9921 Py_INCREF(unicode_empty);
9922 res = unicode_empty;
9923 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009924 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009925
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009927 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009928 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009929 if (seqlen == 1) {
9930 if (PyUnicode_CheckExact(items[0])) {
9931 res = items[0];
9932 Py_INCREF(res);
9933 Py_DECREF(fseq);
9934 return res;
9935 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009936 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009937 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009938 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009939 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009940 /* Set up sep and seplen */
9941 if (separator == NULL) {
9942 /* fall back to a blank space separator */
9943 sep = PyUnicode_FromOrdinal(' ');
9944 if (!sep)
9945 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009946 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009947 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009948 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009949 else {
9950 if (!PyUnicode_Check(separator)) {
9951 PyErr_Format(PyExc_TypeError,
9952 "separator: expected str instance,"
9953 " %.80s found",
9954 Py_TYPE(separator)->tp_name);
9955 goto onError;
9956 }
9957 if (PyUnicode_READY(separator))
9958 goto onError;
9959 sep = separator;
9960 seplen = PyUnicode_GET_LENGTH(separator);
9961 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9962 /* inc refcount to keep this code path symmetric with the
9963 above case of a blank separator */
9964 Py_INCREF(sep);
9965 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009966 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009967 }
9968
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009969 /* There are at least two things to join, or else we have a subclass
9970 * of str in the sequence.
9971 * Do a pre-pass to figure out the total amount of space we'll
9972 * need (sz), and see whether all argument are strings.
9973 */
9974 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009975#ifdef Py_DEBUG
9976 use_memcpy = 0;
9977#else
9978 use_memcpy = 1;
9979#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 for (i = 0; i < seqlen; i++) {
9981 const Py_ssize_t old_sz = sz;
9982 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009983 if (!PyUnicode_Check(item)) {
9984 PyErr_Format(PyExc_TypeError,
9985 "sequence item %zd: expected str instance,"
9986 " %.80s found",
9987 i, Py_TYPE(item)->tp_name);
9988 goto onError;
9989 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 if (PyUnicode_READY(item) == -1)
9991 goto onError;
9992 sz += PyUnicode_GET_LENGTH(item);
9993 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009994 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009995 if (i != 0)
9996 sz += seplen;
9997 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9998 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009999 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000 goto onError;
10001 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010002 if (use_memcpy && last_obj != NULL) {
10003 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10004 use_memcpy = 0;
10005 }
10006 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 }
Tim Petersced69f82003-09-16 20:30:58 +000010008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010010 if (res == NULL)
10011 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010012
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010014#ifdef Py_DEBUG
10015 use_memcpy = 0;
10016#else
10017 if (use_memcpy) {
10018 res_data = PyUnicode_1BYTE_DATA(res);
10019 kind = PyUnicode_KIND(res);
10020 if (seplen != 0)
10021 sep_data = PyUnicode_1BYTE_DATA(sep);
10022 }
10023#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010025 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010026 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010028 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010029 if (use_memcpy) {
10030 Py_MEMCPY(res_data,
10031 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 kind * seplen);
10033 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010034 }
10035 else {
10036 copy_characters(res, res_offset, sep, 0, seplen);
10037 res_offset += seplen;
10038 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010039 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010040 itemlen = PyUnicode_GET_LENGTH(item);
10041 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 if (use_memcpy) {
10043 Py_MEMCPY(res_data,
10044 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010045 kind * itemlen);
10046 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010047 }
10048 else {
10049 copy_characters(res, res_offset, item, 0, itemlen);
10050 res_offset += itemlen;
10051 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010052 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010053 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010054 if (use_memcpy)
10055 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010056 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010057 else
10058 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010059
Tim Peters05eba1f2004-08-27 21:32:02 +000010060 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010062 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010064
Benjamin Peterson29060642009-01-31 22:14:21 +000010065 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010066 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010068 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 return NULL;
10070}
10071
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width:
   PyUnicode_1BYTE_KIND uses memset() on unsigned char, PyUnicode_2BYTE_KIND
   writes Py_UCS2 elements and PyUnicode_4BYTE_KIND writes Py_UCS4 elements.
   PyUnicode_WCHAR_KIND and any unknown kind trip an assertion.

   Every macro argument is parenthesized at each use so that callers may pass
   arbitrary expressions.  Note that `value` and `length` are still evaluated
   more than once in the 2-byte and 4-byte loops, so they must be free of
   side effects.  No bounds checking is done here; the caller must guarantee
   that [start, start + length) lies inside the buffer. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10095
Victor Stinner3fe55312012-01-04 00:33:50 +010010096Py_ssize_t
10097PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10098 Py_UCS4 fill_char)
10099{
10100 Py_ssize_t maxlen;
10101 enum PyUnicode_Kind kind;
10102 void *data;
10103
10104 if (!PyUnicode_Check(unicode)) {
10105 PyErr_BadInternalCall();
10106 return -1;
10107 }
10108 if (PyUnicode_READY(unicode) == -1)
10109 return -1;
10110 if (unicode_check_modifiable(unicode))
10111 return -1;
10112
10113 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10114 PyErr_SetString(PyExc_ValueError,
10115 "fill character is bigger than "
10116 "the string maximum character");
10117 return -1;
10118 }
10119
10120 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10121 length = Py_MIN(maxlen, length);
10122 if (length <= 0)
10123 return 0;
10124
10125 kind = PyUnicode_KIND(unicode);
10126 data = PyUnicode_DATA(unicode);
10127 FILL(kind, data, fill_char, start, length);
10128 return length;
10129}
10130
Victor Stinner9310abb2011-10-05 00:59:23 +020010131static PyObject *
10132pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010133 Py_ssize_t left,
10134 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010136{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 PyObject *u;
10138 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010139 int kind;
10140 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
10142 if (left < 0)
10143 left = 0;
10144 if (right < 0)
10145 right = 0;
10146
Victor Stinnerc4b49542011-12-11 22:44:26 +010010147 if (left == 0 && right == 0)
10148 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10151 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010152 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10153 return NULL;
10154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +020010156 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010158 if (!u)
10159 return NULL;
10160
10161 kind = PyUnicode_KIND(u);
10162 data = PyUnicode_DATA(u);
10163 if (left)
10164 FILL(kind, data, fill, 0, left);
10165 if (right)
10166 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010167 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010168 assert(_PyUnicode_CheckConsistency(u, 1));
10169 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170}
10171
Alexander Belopolsky40018472011-02-26 01:02:56 +000010172PyObject *
10173PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176
10177 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010178 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010179 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010180 if (PyUnicode_READY(string) == -1) {
10181 Py_DECREF(string);
10182 return NULL;
10183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184
Benjamin Petersonead6b532011-12-20 17:23:42 -060010185 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187 if (PyUnicode_IS_ASCII(string))
10188 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 PyUnicode_GET_LENGTH(string), keepends);
10191 else
10192 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010193 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010194 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 break;
10196 case PyUnicode_2BYTE_KIND:
10197 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010198 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010199 PyUnicode_GET_LENGTH(string), keepends);
10200 break;
10201 case PyUnicode_4BYTE_KIND:
10202 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010203 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 PyUnicode_GET_LENGTH(string), keepends);
10205 break;
10206 default:
10207 assert(0);
10208 list = 0;
10209 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210 Py_DECREF(string);
10211 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212}
10213
Alexander Belopolsky40018472011-02-26 01:02:56 +000010214static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010215split(PyObject *self,
10216 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010217 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 int kind1, kind2, kind;
10220 void *buf1, *buf2;
10221 Py_ssize_t len1, len2;
10222 PyObject* out;
10223
Guido van Rossumd57fd912000-03-10 22:53:23 +000010224 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010225 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (PyUnicode_READY(self) == -1)
10228 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010231 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010233 if (PyUnicode_IS_ASCII(self))
10234 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010235 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010236 PyUnicode_GET_LENGTH(self), maxcount
10237 );
10238 else
10239 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010240 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010241 PyUnicode_GET_LENGTH(self), maxcount
10242 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 case PyUnicode_2BYTE_KIND:
10244 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010245 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 PyUnicode_GET_LENGTH(self), maxcount
10247 );
10248 case PyUnicode_4BYTE_KIND:
10249 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010250 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 PyUnicode_GET_LENGTH(self), maxcount
10252 );
10253 default:
10254 assert(0);
10255 return NULL;
10256 }
10257
10258 if (PyUnicode_READY(substring) == -1)
10259 return NULL;
10260
10261 kind1 = PyUnicode_KIND(self);
10262 kind2 = PyUnicode_KIND(substring);
10263 kind = kind1 > kind2 ? kind1 : kind2;
10264 buf1 = PyUnicode_DATA(self);
10265 buf2 = PyUnicode_DATA(substring);
10266 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (!buf1)
10269 return NULL;
10270 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010271 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 if (!buf2) {
10273 if (kind1 != kind) PyMem_Free(buf1);
10274 return NULL;
10275 }
10276 len1 = PyUnicode_GET_LENGTH(self);
10277 len2 = PyUnicode_GET_LENGTH(substring);
10278
Benjamin Petersonead6b532011-12-20 17:23:42 -060010279 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010281 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10282 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010283 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010284 else
10285 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 break;
10288 case PyUnicode_2BYTE_KIND:
10289 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 break;
10292 case PyUnicode_4BYTE_KIND:
10293 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010294 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 break;
10296 default:
10297 out = NULL;
10298 }
10299 if (kind1 != kind)
10300 PyMem_Free(buf1);
10301 if (kind2 != kind)
10302 PyMem_Free(buf2);
10303 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304}
10305
Alexander Belopolsky40018472011-02-26 01:02:56 +000010306static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010307rsplit(PyObject *self,
10308 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010309 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010310{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 int kind1, kind2, kind;
10312 void *buf1, *buf2;
10313 Py_ssize_t len1, len2;
10314 PyObject* out;
10315
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010316 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010317 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 if (PyUnicode_READY(self) == -1)
10320 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010323 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010325 if (PyUnicode_IS_ASCII(self))
10326 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010328 PyUnicode_GET_LENGTH(self), maxcount
10329 );
10330 else
10331 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 PyUnicode_GET_LENGTH(self), maxcount
10334 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 case PyUnicode_2BYTE_KIND:
10336 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010337 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 PyUnicode_GET_LENGTH(self), maxcount
10339 );
10340 case PyUnicode_4BYTE_KIND:
10341 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010342 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 PyUnicode_GET_LENGTH(self), maxcount
10344 );
10345 default:
10346 assert(0);
10347 return NULL;
10348 }
10349
10350 if (PyUnicode_READY(substring) == -1)
10351 return NULL;
10352
10353 kind1 = PyUnicode_KIND(self);
10354 kind2 = PyUnicode_KIND(substring);
10355 kind = kind1 > kind2 ? kind1 : kind2;
10356 buf1 = PyUnicode_DATA(self);
10357 buf2 = PyUnicode_DATA(substring);
10358 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (!buf1)
10361 return NULL;
10362 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 if (!buf2) {
10365 if (kind1 != kind) PyMem_Free(buf1);
10366 return NULL;
10367 }
10368 len1 = PyUnicode_GET_LENGTH(self);
10369 len2 = PyUnicode_GET_LENGTH(substring);
10370
Benjamin Petersonead6b532011-12-20 17:23:42 -060010371 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10374 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010375 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010376 else
10377 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010378 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 break;
10380 case PyUnicode_2BYTE_KIND:
10381 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 case PyUnicode_4BYTE_KIND:
10385 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 break;
10388 default:
10389 out = NULL;
10390 }
10391 if (kind1 != kind)
10392 PyMem_Free(buf1);
10393 if (kind2 != kind)
10394 PyMem_Free(buf2);
10395 return out;
10396}
10397
10398static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10400 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010402 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010404 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10405 return asciilib_find(buf1, len1, buf2, len2, offset);
10406 else
10407 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 case PyUnicode_2BYTE_KIND:
10409 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10410 case PyUnicode_4BYTE_KIND:
10411 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10412 }
10413 assert(0);
10414 return -1;
10415}
10416
10417static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010418anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10419 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010421 switch (kind) {
10422 case PyUnicode_1BYTE_KIND:
10423 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10424 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10425 else
10426 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10427 case PyUnicode_2BYTE_KIND:
10428 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10429 case PyUnicode_4BYTE_KIND:
10430 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10431 }
10432 assert(0);
10433 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010434}
10435
Alexander Belopolsky40018472011-02-26 01:02:56 +000010436static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437replace(PyObject *self, PyObject *str1,
10438 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 PyObject *u;
10441 char *sbuf = PyUnicode_DATA(self);
10442 char *buf1 = PyUnicode_DATA(str1);
10443 char *buf2 = PyUnicode_DATA(str2);
10444 int srelease = 0, release1 = 0, release2 = 0;
10445 int skind = PyUnicode_KIND(self);
10446 int kind1 = PyUnicode_KIND(str1);
10447 int kind2 = PyUnicode_KIND(str2);
10448 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10449 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10450 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010451 int mayshrink;
10452 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010453
10454 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010455 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010457 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458
Victor Stinner59de0ee2011-10-07 10:01:28 +020010459 if (str1 == str2)
10460 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (skind < kind1)
10462 /* substring too wide to be present */
10463 goto nothing;
10464
Victor Stinner49a0a212011-10-12 23:46:10 +020010465 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10466 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10467 /* Replacing str1 with str2 may cause a maxchar reduction in the
10468 result string. */
10469 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010470 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010475 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010478 Py_UCS4 u1, u2;
10479 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010480 Py_ssize_t index, pos;
10481 char *src;
10482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010484 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10485 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010489 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010491 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010492 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010493
10494 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10495 index = 0;
10496 src = sbuf;
10497 while (--maxcount)
10498 {
10499 pos++;
10500 src += pos * PyUnicode_KIND(self);
10501 slen -= pos;
10502 index += pos;
10503 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10504 if (pos < 0)
10505 break;
10506 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10507 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010508 }
10509 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 int rkind = skind;
10511 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010512 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (kind1 < rkind) {
10515 /* widen substring */
10516 buf1 = _PyUnicode_AsKind(str1, rkind);
10517 if (!buf1) goto error;
10518 release1 = 1;
10519 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010520 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 if (i < 0)
10522 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (rkind > kind2) {
10524 /* widen replacement */
10525 buf2 = _PyUnicode_AsKind(str2, rkind);
10526 if (!buf2) goto error;
10527 release2 = 1;
10528 }
10529 else if (rkind < kind2) {
10530 /* widen self and buf1 */
10531 rkind = kind2;
10532 if (release1) PyMem_Free(buf1);
10533 sbuf = _PyUnicode_AsKind(self, rkind);
10534 if (!sbuf) goto error;
10535 srelease = 1;
10536 buf1 = _PyUnicode_AsKind(str1, rkind);
10537 if (!buf1) goto error;
10538 release1 = 1;
10539 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 u = PyUnicode_New(slen, maxchar);
10541 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 assert(PyUnicode_KIND(u) == rkind);
10544 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010545
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010546 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010547 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010548 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010550 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010552
10553 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010554 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010555 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010556 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010557 if (i == -1)
10558 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010559 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010561 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 }
10566 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 Py_ssize_t n, i, j, ires;
10568 Py_ssize_t product, new_size;
10569 int rkind = skind;
10570 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf1 = _PyUnicode_AsKind(str1, rkind);
10575 if (!buf1) goto error;
10576 release1 = 1;
10577 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010578 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010579 if (n == 0)
10580 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 buf2 = _PyUnicode_AsKind(str2, rkind);
10584 if (!buf2) goto error;
10585 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010588 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 rkind = kind2;
10590 sbuf = _PyUnicode_AsKind(self, rkind);
10591 if (!sbuf) goto error;
10592 srelease = 1;
10593 if (release1) PyMem_Free(buf1);
10594 buf1 = _PyUnicode_AsKind(str1, rkind);
10595 if (!buf1) goto error;
10596 release1 = 1;
10597 }
10598 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10599 PyUnicode_GET_LENGTH(str1))); */
10600 product = n * (len2-len1);
10601 if ((product / (len2-len1)) != n) {
10602 PyErr_SetString(PyExc_OverflowError,
10603 "replace string is too long");
10604 goto error;
10605 }
10606 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 if (new_size == 0) {
10608 Py_INCREF(unicode_empty);
10609 u = unicode_empty;
10610 goto done;
10611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10613 PyErr_SetString(PyExc_OverflowError,
10614 "replace string is too long");
10615 goto error;
10616 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 u = PyUnicode_New(new_size, maxchar);
10618 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010620 assert(PyUnicode_KIND(u) == rkind);
10621 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 ires = i = 0;
10623 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624 while (n-- > 0) {
10625 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010626 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010627 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010628 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010629 if (j == -1)
10630 break;
10631 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010633 memcpy(res + rkind * ires,
10634 sbuf + rkind * i,
10635 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 }
10638 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010640 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010642 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010649 memcpy(res + rkind * ires,
10650 sbuf + rkind * i,
10651 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010652 }
10653 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 /* interleave */
10655 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010656 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010658 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 if (--n <= 0)
10661 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 memcpy(res + rkind * ires,
10663 sbuf + rkind * i,
10664 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 ires++;
10666 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 memcpy(res + rkind * ires,
10669 sbuf + rkind * i,
10670 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010672 }
10673
10674 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010675 unicode_adjust_maxchar(&u);
10676 if (u == NULL)
10677 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010679
10680 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 if (srelease)
10682 PyMem_FREE(sbuf);
10683 if (release1)
10684 PyMem_FREE(buf1);
10685 if (release2)
10686 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010687 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689
Benjamin Peterson29060642009-01-31 22:14:21 +000010690 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010691 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (srelease)
10693 PyMem_FREE(sbuf);
10694 if (release1)
10695 PyMem_FREE(buf1);
10696 if (release2)
10697 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010698 return unicode_result_unchanged(self);
10699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 error:
10701 if (srelease && sbuf)
10702 PyMem_FREE(sbuf);
10703 if (release1 && buf1)
10704 PyMem_FREE(buf1);
10705 if (release2 && buf2)
10706 PyMem_FREE(buf2);
10707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708}
10709
10710/* --- Unicode Object Methods --------------------------------------------- */
10711
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010712PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010713 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714\n\
10715Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010716characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010719unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010721 if (PyUnicode_READY(self) == -1)
10722 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010723 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724}
10725
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010726PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010727 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728\n\
10729Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010730have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731
10732static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010733unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010735 if (PyUnicode_READY(self) == -1)
10736 return NULL;
10737 if (PyUnicode_GET_LENGTH(self) == 0)
10738 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010739 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740}
10741
Benjamin Petersond5890c82012-01-14 13:23:30 -050010742PyDoc_STRVAR(casefold__doc__,
10743 "S.casefold() -> str\n\
10744\n\
10745Return a version of S suitable for caseless comparisons.");
10746
10747static PyObject *
10748unicode_casefold(PyObject *self)
10749{
10750 if (PyUnicode_READY(self) == -1)
10751 return NULL;
10752 if (PyUnicode_IS_ASCII(self))
10753 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010754 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010755}
10756
10757
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010758/* Argument converter. Coerces to a single unicode character */
10759
10760static int
10761convert_uc(PyObject *obj, void *addr)
10762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010764 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010765
Benjamin Peterson14339b62009-01-31 16:36:08 +000010766 uniobj = PyUnicode_FromObject(obj);
10767 if (uniobj == NULL) {
10768 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010770 return 0;
10771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010773 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010774 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010775 Py_DECREF(uniobj);
10776 return 0;
10777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010779 Py_DECREF(uniobj);
10780 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010781}
10782
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010783PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010784 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010786Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010787done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788
10789static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010790unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010792 Py_ssize_t marg, left;
10793 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 Py_UCS4 fillchar = ' ';
10795
Victor Stinnere9a29352011-10-01 02:14:59 +020010796 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798
Benjamin Petersonbac79492012-01-14 13:34:47 -050010799 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 return NULL;
10801
Victor Stinnerc4b49542011-12-11 22:44:26 +010010802 if (PyUnicode_GET_LENGTH(self) >= width)
10803 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804
Victor Stinnerc4b49542011-12-11 22:44:26 +010010805 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806 left = marg / 2 + (marg & width & 1);
10807
Victor Stinner9310abb2011-10-05 00:59:23 +020010808 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809}
10810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811/* This function assumes that str1 and str2 are readied by the caller. */
10812
Marc-André Lemburge5034372000-08-08 08:04:29 +000010813static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010814unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010815{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 int kind1, kind2;
10817 void *data1, *data2;
10818 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 kind1 = PyUnicode_KIND(str1);
10821 kind2 = PyUnicode_KIND(str2);
10822 data1 = PyUnicode_DATA(str1);
10823 data2 = PyUnicode_DATA(str2);
10824 len1 = PyUnicode_GET_LENGTH(str1);
10825 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 for (i = 0; i < len1 && i < len2; ++i) {
10828 Py_UCS4 c1, c2;
10829 c1 = PyUnicode_READ(kind1, data1, i);
10830 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010831
10832 if (c1 != c2)
10833 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010834 }
10835
10836 return (len1 < len2) ? -1 : (len1 != len2);
10837}
10838
Alexander Belopolsky40018472011-02-26 01:02:56 +000010839int
10840PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10843 if (PyUnicode_READY(left) == -1 ||
10844 PyUnicode_READY(right) == -1)
10845 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010846 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010848 PyErr_Format(PyExc_TypeError,
10849 "Can't compare %.100s and %.100s",
10850 left->ob_type->tp_name,
10851 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 return -1;
10853}
10854
Martin v. Löwis5b222132007-06-10 09:51:05 +000010855int
10856PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 Py_ssize_t i;
10859 int kind;
10860 void *data;
10861 Py_UCS4 chr;
10862
Victor Stinner910337b2011-10-03 03:20:16 +020010863 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 if (PyUnicode_READY(uni) == -1)
10865 return -1;
10866 kind = PyUnicode_KIND(uni);
10867 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010868 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10870 if (chr != str[i])
10871 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010872 /* This check keeps Python strings that end in '\0' from comparing equal
10873 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010876 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010878 return 0;
10879}
10880
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010881
Benjamin Peterson29060642009-01-31 22:14:21 +000010882#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010883 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010884
Alexander Belopolsky40018472011-02-26 01:02:56 +000010885PyObject *
10886PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010887{
10888 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010889
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010890 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10891 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (PyUnicode_READY(left) == -1 ||
10893 PyUnicode_READY(right) == -1)
10894 return NULL;
10895 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10896 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010897 if (op == Py_EQ) {
10898 Py_INCREF(Py_False);
10899 return Py_False;
10900 }
10901 if (op == Py_NE) {
10902 Py_INCREF(Py_True);
10903 return Py_True;
10904 }
10905 }
10906 if (left == right)
10907 result = 0;
10908 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010909 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010910
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010911 /* Convert the return value to a Boolean */
10912 switch (op) {
10913 case Py_EQ:
10914 v = TEST_COND(result == 0);
10915 break;
10916 case Py_NE:
10917 v = TEST_COND(result != 0);
10918 break;
10919 case Py_LE:
10920 v = TEST_COND(result <= 0);
10921 break;
10922 case Py_GE:
10923 v = TEST_COND(result >= 0);
10924 break;
10925 case Py_LT:
10926 v = TEST_COND(result == -1);
10927 break;
10928 case Py_GT:
10929 v = TEST_COND(result == 1);
10930 break;
10931 default:
10932 PyErr_BadArgument();
10933 return NULL;
10934 }
10935 Py_INCREF(v);
10936 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010937 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010938
Brian Curtindfc80e32011-08-10 20:28:54 -050010939 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010940}
10941
Alexander Belopolsky40018472011-02-26 01:02:56 +000010942int
10943PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010944{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010945 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946 int kind1, kind2, kind;
10947 void *buf1, *buf2;
10948 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010949 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010950
10951 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010952 sub = PyUnicode_FromObject(element);
10953 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010954 PyErr_Format(PyExc_TypeError,
10955 "'in <string>' requires string as left operand, not %s",
10956 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010957 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010958 }
10959
Thomas Wouters477c8d52006-05-27 19:21:47 +000010960 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010961 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010962 Py_DECREF(sub);
10963 return -1;
10964 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010965 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10966 Py_DECREF(sub);
10967 Py_DECREF(str);
10968 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 kind1 = PyUnicode_KIND(str);
10971 kind2 = PyUnicode_KIND(sub);
10972 kind = kind1 > kind2 ? kind1 : kind2;
10973 buf1 = PyUnicode_DATA(str);
10974 buf2 = PyUnicode_DATA(sub);
10975 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010976 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 if (!buf1) {
10978 Py_DECREF(sub);
10979 return -1;
10980 }
10981 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010982 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 if (!buf2) {
10984 Py_DECREF(sub);
10985 if (kind1 != kind) PyMem_Free(buf1);
10986 return -1;
10987 }
10988 len1 = PyUnicode_GET_LENGTH(str);
10989 len2 = PyUnicode_GET_LENGTH(sub);
10990
Benjamin Petersonead6b532011-12-20 17:23:42 -060010991 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 case PyUnicode_1BYTE_KIND:
10993 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10994 break;
10995 case PyUnicode_2BYTE_KIND:
10996 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10997 break;
10998 case PyUnicode_4BYTE_KIND:
10999 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11000 break;
11001 default:
11002 result = -1;
11003 assert(0);
11004 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011005
11006 Py_DECREF(str);
11007 Py_DECREF(sub);
11008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 if (kind1 != kind)
11010 PyMem_Free(buf1);
11011 if (kind2 != kind)
11012 PyMem_Free(buf2);
11013
Guido van Rossum403d68b2000-03-13 15:55:09 +000011014 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011015}
11016
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017/* Concat to string or Unicode object giving a new Unicode object. */
11018
Alexander Belopolsky40018472011-02-26 01:02:56 +000011019PyObject *
11020PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011023 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011024 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025
11026 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033
11034 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011035 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011039 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 }
11043
Victor Stinner488fa492011-12-12 00:01:39 +010011044 u_len = PyUnicode_GET_LENGTH(u);
11045 v_len = PyUnicode_GET_LENGTH(v);
11046 if (u_len > PY_SSIZE_T_MAX - v_len) {
11047 PyErr_SetString(PyExc_OverflowError,
11048 "strings are too large to concat");
11049 goto onError;
11050 }
11051 new_len = u_len + v_len;
11052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011054 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020011055 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011056
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011058 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011061 copy_characters(w, 0, u, 0, u_len);
11062 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 Py_DECREF(u);
11064 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011065 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Benjamin Peterson29060642009-01-31 22:14:21 +000011068 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069 Py_XDECREF(u);
11070 Py_XDECREF(v);
11071 return NULL;
11072}
11073
Walter Dörwald1ab83302007-05-18 17:15:44 +000011074void
Victor Stinner23e56682011-10-03 03:54:37 +020011075PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011076{
Victor Stinner23e56682011-10-03 03:54:37 +020011077 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011078 Py_UCS4 maxchar, maxchar2;
11079 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011080
11081 if (p_left == NULL) {
11082 if (!PyErr_Occurred())
11083 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011084 return;
11085 }
Victor Stinner23e56682011-10-03 03:54:37 +020011086 left = *p_left;
11087 if (right == NULL || !PyUnicode_Check(left)) {
11088 if (!PyErr_Occurred())
11089 PyErr_BadInternalCall();
11090 goto error;
11091 }
11092
Benjamin Petersonbac79492012-01-14 13:34:47 -050011093 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011094 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011095 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011096 goto error;
11097
Victor Stinner488fa492011-12-12 00:01:39 +010011098 /* Shortcuts */
11099 if (left == unicode_empty) {
11100 Py_DECREF(left);
11101 Py_INCREF(right);
11102 *p_left = right;
11103 return;
11104 }
11105 if (right == unicode_empty)
11106 return;
11107
11108 left_len = PyUnicode_GET_LENGTH(left);
11109 right_len = PyUnicode_GET_LENGTH(right);
11110 if (left_len > PY_SSIZE_T_MAX - right_len) {
11111 PyErr_SetString(PyExc_OverflowError,
11112 "strings are too large to concat");
11113 goto error;
11114 }
11115 new_len = left_len + right_len;
11116
11117 if (unicode_modifiable(left)
11118 && PyUnicode_CheckExact(right)
11119 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011120 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11121 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011122 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011123 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011124 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11125 {
11126 /* append inplace */
11127 if (unicode_resize(p_left, new_len) != 0) {
11128 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11129 * deallocated so it cannot be put back into
11130 * 'variable'. The MemoryError is raised when there
11131 * is no value in 'variable', which might (very
11132 * remotely) be a cause of incompatibilities.
11133 */
11134 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011135 }
Victor Stinner488fa492011-12-12 00:01:39 +010011136 /* copy 'right' into the newly allocated area of 'left' */
11137 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011138 }
Victor Stinner488fa492011-12-12 00:01:39 +010011139 else {
11140 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11141 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020011142 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011143
Victor Stinner488fa492011-12-12 00:01:39 +010011144 /* Concat the two Unicode strings */
11145 res = PyUnicode_New(new_len, maxchar);
11146 if (res == NULL)
11147 goto error;
11148 copy_characters(res, 0, left, 0, left_len);
11149 copy_characters(res, left_len, right, 0, right_len);
11150 Py_DECREF(left);
11151 *p_left = res;
11152 }
11153 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011154 return;
11155
11156error:
Victor Stinner488fa492011-12-12 00:01:39 +010011157 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011158}
11159
11160void
11161PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11162{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011163 PyUnicode_Append(pleft, right);
11164 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011165}
11166
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011167PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011168 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011170Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011171string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011172interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
11174static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011175unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011177 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011178 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011179 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 int kind1, kind2, kind;
11182 void *buf1, *buf2;
11183 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184
Jesus Ceaac451502011-04-20 17:09:23 +020011185 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11186 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011187 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 kind1 = PyUnicode_KIND(self);
11190 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011191 if (kind2 > kind1)
11192 return PyLong_FromLong(0);
11193 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011194 buf1 = PyUnicode_DATA(self);
11195 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011197 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 if (!buf2) {
11199 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 return NULL;
11201 }
11202 len1 = PyUnicode_GET_LENGTH(self);
11203 len2 = PyUnicode_GET_LENGTH(substring);
11204
11205 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011206 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 case PyUnicode_1BYTE_KIND:
11208 iresult = ucs1lib_count(
11209 ((Py_UCS1*)buf1) + start, end - start,
11210 buf2, len2, PY_SSIZE_T_MAX
11211 );
11212 break;
11213 case PyUnicode_2BYTE_KIND:
11214 iresult = ucs2lib_count(
11215 ((Py_UCS2*)buf1) + start, end - start,
11216 buf2, len2, PY_SSIZE_T_MAX
11217 );
11218 break;
11219 case PyUnicode_4BYTE_KIND:
11220 iresult = ucs4lib_count(
11221 ((Py_UCS4*)buf1) + start, end - start,
11222 buf2, len2, PY_SSIZE_T_MAX
11223 );
11224 break;
11225 default:
11226 assert(0); iresult = 0;
11227 }
11228
11229 result = PyLong_FromSsize_t(iresult);
11230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 if (kind2 != kind)
11232 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233
11234 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011235
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 return result;
11237}
11238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011239PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011240 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011242Encode S using the codec registered for encoding. Default encoding\n\
11243is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011244handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011245a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11246'xmlcharrefreplace' as well as any other name registered with\n\
11247codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
11249static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011250unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011252 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 char *encoding = NULL;
11254 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011255
Benjamin Peterson308d6372009-09-18 21:42:35 +000011256 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11257 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011259 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011260}
11261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011262PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264\n\
11265Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011266If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
11268static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011269unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011271 Py_ssize_t i, j, line_pos, src_len, incr;
11272 Py_UCS4 ch;
11273 PyObject *u;
11274 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011276 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011277 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
11279 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Antoine Pitrou22425222011-10-04 19:10:51 +020011282 if (PyUnicode_READY(self) == -1)
11283 return NULL;
11284
Thomas Wouters7e474022000-07-16 12:04:32 +000011285 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 src_len = PyUnicode_GET_LENGTH(self);
11287 i = j = line_pos = 0;
11288 kind = PyUnicode_KIND(self);
11289 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011290 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011291 for (; i < src_len; i++) {
11292 ch = PyUnicode_READ(kind, src_data, i);
11293 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011294 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011295 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011296 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011298 goto overflow;
11299 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011301 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011305 goto overflow;
11306 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 if (ch == '\n' || ch == '\r')
11309 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011312 if (!found)
11313 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011316 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317 if (!u)
11318 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011319 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
Antoine Pitroue71d5742011-10-04 15:55:09 +020011321 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Antoine Pitroue71d5742011-10-04 15:55:09 +020011323 for (; i < src_len; i++) {
11324 ch = PyUnicode_READ(kind, src_data, i);
11325 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011327 incr = tabsize - (line_pos % tabsize);
11328 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011329 FILL(kind, dest_data, ' ', j, incr);
11330 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011334 line_pos++;
11335 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011336 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011337 if (ch == '\n' || ch == '\r')
11338 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011340 }
11341 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011342 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011343
Antoine Pitroue71d5742011-10-04 15:55:09 +020011344 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011345 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
11352Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011353such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354arguments start and end are interpreted as in slice notation.\n\
11355\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011356Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357
11358static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011361 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011362 Py_ssize_t start;
11363 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011364 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365
Jesus Ceaac451502011-04-20 17:09:23 +020011366 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11367 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 if (PyUnicode_READY(self) == -1)
11371 return NULL;
11372 if (PyUnicode_READY(substring) == -1)
11373 return NULL;
11374
Victor Stinner7931d9a2011-11-04 00:22:48 +010011375 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
11377 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 if (result == -2)
11380 return NULL;
11381
Christian Heimes217cfd12007-12-02 14:31:20 +000011382 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383}
11384
11385static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011386unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011387{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011388 void *data;
11389 enum PyUnicode_Kind kind;
11390 Py_UCS4 ch;
11391 PyObject *res;
11392
11393 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11394 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011396 }
11397 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11398 PyErr_SetString(PyExc_IndexError, "string index out of range");
11399 return NULL;
11400 }
11401 kind = PyUnicode_KIND(self);
11402 data = PyUnicode_DATA(self);
11403 ch = PyUnicode_READ(kind, data, index);
11404 if (ch < 256)
11405 return get_latin1_char(ch);
11406
11407 res = PyUnicode_New(1, ch);
11408 if (res == NULL)
11409 return NULL;
11410 kind = PyUnicode_KIND(res);
11411 data = PyUnicode_DATA(res);
11412 PyUnicode_WRITE(kind, data, 0, ch);
11413 assert(_PyUnicode_CheckConsistency(res, 1));
11414 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415}
11416
Guido van Rossumc2504932007-09-18 19:42:40 +000011417/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011418 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011419static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011420unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Guido van Rossumc2504932007-09-18 19:42:40 +000011422 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011423 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011424
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011425#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011426 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011427#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (_PyUnicode_HASH(self) != -1)
11429 return _PyUnicode_HASH(self);
11430 if (PyUnicode_READY(self) == -1)
11431 return -1;
11432 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011433 /*
11434 We make the hash of the empty string be 0, rather than using
11435 (prefix ^ suffix), since this slightly obfuscates the hash secret
11436 */
11437 if (len == 0) {
11438 _PyUnicode_HASH(self) = 0;
11439 return 0;
11440 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441
11442 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011443#define HASH(P) \
11444 x ^= (Py_uhash_t) *P << 7; \
11445 while (--len >= 0) \
11446 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447
Georg Brandl2fb477c2012-02-21 00:33:36 +010011448 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011449 switch (PyUnicode_KIND(self)) {
11450 case PyUnicode_1BYTE_KIND: {
11451 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11452 HASH(c);
11453 break;
11454 }
11455 case PyUnicode_2BYTE_KIND: {
11456 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11457 HASH(s);
11458 break;
11459 }
11460 default: {
11461 Py_UCS4 *l;
11462 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11463 "Impossible switch case in unicode_hash");
11464 l = PyUnicode_4BYTE_DATA(self);
11465 HASH(l);
11466 break;
11467 }
11468 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011469 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11470 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471
Guido van Rossumc2504932007-09-18 19:42:40 +000011472 if (x == -1)
11473 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011475 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011479PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483
11484static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011487 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011488 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011489 Py_ssize_t start;
11490 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Jesus Ceaac451502011-04-20 17:09:23 +020011492 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11493 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 if (PyUnicode_READY(self) == -1)
11497 return NULL;
11498 if (PyUnicode_READY(substring) == -1)
11499 return NULL;
11500
Victor Stinner7931d9a2011-11-04 00:22:48 +010011501 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 if (result == -2)
11506 return NULL;
11507
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 if (result < 0) {
11509 PyErr_SetString(PyExc_ValueError, "substring not found");
11510 return NULL;
11511 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011512
Christian Heimes217cfd12007-12-02 14:31:20 +000011513 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514}
11515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011519Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011520at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521
11522static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011523unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 Py_ssize_t i, length;
11526 int kind;
11527 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 int cased;
11529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (PyUnicode_READY(self) == -1)
11531 return NULL;
11532 length = PyUnicode_GET_LENGTH(self);
11533 kind = PyUnicode_KIND(self);
11534 data = PyUnicode_DATA(self);
11535
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 if (length == 1)
11538 return PyBool_FromLong(
11539 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011541 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011543 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011544
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 for (i = 0; i < length; i++) {
11547 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011548
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11550 return PyBool_FromLong(0);
11551 else if (!cased && Py_UNICODE_ISLOWER(ch))
11552 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011554 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555}
11556
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011557PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011560Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011561at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562
11563static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011564unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 Py_ssize_t i, length;
11567 int kind;
11568 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569 int cased;
11570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 if (PyUnicode_READY(self) == -1)
11572 return NULL;
11573 length = PyUnicode_GET_LENGTH(self);
11574 kind = PyUnicode_KIND(self);
11575 data = PyUnicode_DATA(self);
11576
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 if (length == 1)
11579 return PyBool_FromLong(
11580 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011582 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011585
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 for (i = 0; i < length; i++) {
11588 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011589
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11591 return PyBool_FromLong(0);
11592 else if (!cased && Py_UNICODE_ISUPPER(ch))
11593 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011595 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596}
11597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011601Return True if S is a titlecased string and there is at least one\n\
11602character in S, i.e. upper- and titlecase characters may only\n\
11603follow uncased characters and lowercase characters only cased ones.\n\
11604Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
11606static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011607unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 Py_ssize_t i, length;
11610 int kind;
11611 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612 int cased, previous_is_cased;
11613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (PyUnicode_READY(self) == -1)
11615 return NULL;
11616 length = PyUnicode_GET_LENGTH(self);
11617 kind = PyUnicode_KIND(self);
11618 data = PyUnicode_DATA(self);
11619
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 if (length == 1) {
11622 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11623 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11624 (Py_UNICODE_ISUPPER(ch) != 0));
11625 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011627 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011630
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 cased = 0;
11632 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 for (i = 0; i < length; i++) {
11634 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011635
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11637 if (previous_is_cased)
11638 return PyBool_FromLong(0);
11639 previous_is_cased = 1;
11640 cased = 1;
11641 }
11642 else if (Py_UNICODE_ISLOWER(ch)) {
11643 if (!previous_is_cased)
11644 return PyBool_FromLong(0);
11645 previous_is_cased = 1;
11646 cased = 1;
11647 }
11648 else
11649 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011651 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652}
11653
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011654PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011655 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011657Return True if all characters in S are whitespace\n\
11658and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
11660static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011661unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 Py_ssize_t i, length;
11664 int kind;
11665 void *data;
11666
11667 if (PyUnicode_READY(self) == -1)
11668 return NULL;
11669 length = PyUnicode_GET_LENGTH(self);
11670 kind = PyUnicode_KIND(self);
11671 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 if (length == 1)
11675 return PyBool_FromLong(
11676 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011678 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 for (i = 0; i < length; i++) {
11683 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011684 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011687 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688}
11689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011690PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011692\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011693Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011694and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011695
11696static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011697unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 Py_ssize_t i, length;
11700 int kind;
11701 void *data;
11702
11703 if (PyUnicode_READY(self) == -1)
11704 return NULL;
11705 length = PyUnicode_GET_LENGTH(self);
11706 kind = PyUnicode_KIND(self);
11707 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011708
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011709 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (length == 1)
11711 return PyBool_FromLong(
11712 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011713
11714 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 for (i = 0; i < length; i++) {
11719 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011721 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011722 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011723}
11724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011725PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011727\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011728Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011729and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011730
11731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 int kind;
11735 void *data;
11736 Py_ssize_t len, i;
11737
11738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
11743 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011744
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011745 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 if (len == 1) {
11747 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11748 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11749 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011750
11751 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011753 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 for (i = 0; i < len; i++) {
11756 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011757 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011760 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011761}
11762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011766Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011767False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
11769static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011770unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 Py_ssize_t i, length;
11773 int kind;
11774 void *data;
11775
11776 if (PyUnicode_READY(self) == -1)
11777 return NULL;
11778 length = PyUnicode_GET_LENGTH(self);
11779 kind = PyUnicode_KIND(self);
11780 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011783 if (length == 1)
11784 return PyBool_FromLong(
11785 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011787 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011789 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 for (i = 0; i < length; i++) {
11792 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011793 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011795 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796}
11797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011798PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011801Return True if all characters in S are digits\n\
11802and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803
11804static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011805unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 Py_ssize_t i, length;
11808 int kind;
11809 void *data;
11810
11811 if (PyUnicode_READY(self) == -1)
11812 return NULL;
11813 length = PyUnicode_GET_LENGTH(self);
11814 kind = PyUnicode_KIND(self);
11815 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 if (length == 1) {
11819 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11820 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011823 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 for (i = 0; i < length; i++) {
11828 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011831 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832}
11833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011837Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011838False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
11840static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011841unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 Py_ssize_t i, length;
11844 int kind;
11845 void *data;
11846
11847 if (PyUnicode_READY(self) == -1)
11848 return NULL;
11849 length = PyUnicode_GET_LENGTH(self);
11850 kind = PyUnicode_KIND(self);
11851 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (length == 1)
11855 return PyBool_FromLong(
11856 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011860 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 for (i = 0; i < length; i++) {
11863 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011866 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
Martin v. Löwis47383402007-08-15 07:32:56 +000011869int
11870PyUnicode_IsIdentifier(PyObject *self)
11871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 int kind;
11873 void *data;
11874 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011875 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (PyUnicode_READY(self) == -1) {
11878 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011879 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 }
11881
11882 /* Special case for empty strings */
11883 if (PyUnicode_GET_LENGTH(self) == 0)
11884 return 0;
11885 kind = PyUnicode_KIND(self);
11886 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011887
11888 /* PEP 3131 says that the first character must be in
11889 XID_Start and subsequent characters in XID_Continue,
11890 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011891 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011892 letters, digits, underscore). However, given the current
11893 definition of XID_Start and XID_Continue, it is sufficient
11894 to check just for these, except that _ must be allowed
11895 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011897 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011898 return 0;
11899
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011900 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011903 return 1;
11904}
11905
11906PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011908\n\
11909Return True if S is a valid identifier according\n\
11910to the language definition.");
11911
11912static PyObject*
11913unicode_isidentifier(PyObject *self)
11914{
11915 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11916}
11917
Georg Brandl559e5d72008-06-11 18:37:52 +000011918PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011920\n\
11921Return True if all characters in S are considered\n\
11922printable in repr() or S is empty, False otherwise.");
11923
11924static PyObject*
11925unicode_isprintable(PyObject *self)
11926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 Py_ssize_t i, length;
11928 int kind;
11929 void *data;
11930
11931 if (PyUnicode_READY(self) == -1)
11932 return NULL;
11933 length = PyUnicode_GET_LENGTH(self);
11934 kind = PyUnicode_KIND(self);
11935 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011936
11937 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 if (length == 1)
11939 return PyBool_FromLong(
11940 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 for (i = 0; i < length; i++) {
11943 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011944 Py_RETURN_FALSE;
11945 }
11946 }
11947 Py_RETURN_TRUE;
11948}
11949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011950PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011951 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952\n\
11953Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011954iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
11956static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011957unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011959 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960}
11961
Martin v. Löwis18e16552006-02-15 17:27:45 +000011962static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011963unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 if (PyUnicode_READY(self) == -1)
11966 return -1;
11967 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968}
11969
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011970PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011971 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011973Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011974done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
11976static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011977unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011979 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 Py_UCS4 fillchar = ' ';
11981
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011982 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983 return NULL;
11984
Benjamin Petersonbac79492012-01-14 13:34:47 -050011985 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011986 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987
Victor Stinnerc4b49542011-12-11 22:44:26 +010011988 if (PyUnicode_GET_LENGTH(self) >= width)
11989 return unicode_result_unchanged(self);
11990
11991 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992}
11993
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011994PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011997Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998
11999static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012000unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004 if (PyUnicode_IS_ASCII(self))
12005 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012006 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007}
12008
/* Selector values for the strip helpers.  They are also used as indices
   into stripformat[] below, so the two must stay in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
12017
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012018/* externally visible for str.strip(unicode) */
12019PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012020_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 void *data;
12023 int kind;
12024 Py_ssize_t i, j, len;
12025 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12028 return NULL;
12029
12030 kind = PyUnicode_KIND(self);
12031 data = PyUnicode_DATA(self);
12032 len = PyUnicode_GET_LENGTH(self);
12033 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12034 PyUnicode_DATA(sepobj),
12035 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000012036
Benjamin Peterson14339b62009-01-31 16:36:08 +000012037 i = 0;
12038 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 while (i < len &&
12040 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012041 i++;
12042 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012043 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012044
Benjamin Peterson14339b62009-01-31 16:36:08 +000012045 j = len;
12046 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 do {
12048 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 } while (j >= i &&
12050 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012052 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012053
Victor Stinner7931d9a2011-11-04 00:22:48 +010012054 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055}
12056
12057PyObject*
12058PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12059{
12060 unsigned char *data;
12061 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012062 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063
Victor Stinnerde636f32011-10-01 03:55:54 +020012064 if (PyUnicode_READY(self) == -1)
12065 return NULL;
12066
Victor Stinner684d5fd2012-05-03 02:32:34 +020012067 length = PyUnicode_GET_LENGTH(self);
12068 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012069
Victor Stinner684d5fd2012-05-03 02:32:34 +020012070 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012071 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072
Victor Stinnerde636f32011-10-01 03:55:54 +020012073 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012074 PyErr_SetString(PyExc_IndexError, "string index out of range");
12075 return NULL;
12076 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020012077 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020012078 Py_INCREF(unicode_empty);
12079 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020012080 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012081
Victor Stinner684d5fd2012-05-03 02:32:34 +020012082 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012083 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012084 data = PyUnicode_1BYTE_DATA(self);
12085 return unicode_fromascii(data + start, length);
12086 }
12087 else {
12088 kind = PyUnicode_KIND(self);
12089 data = PyUnicode_1BYTE_DATA(self);
12090 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012091 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012092 length);
12093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
12096static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012097do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 int kind;
12100 void *data;
12101 Py_ssize_t len, i, j;
12102
12103 if (PyUnicode_READY(self) == -1)
12104 return NULL;
12105
12106 kind = PyUnicode_KIND(self);
12107 data = PyUnicode_DATA(self);
12108 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012109
Benjamin Peterson14339b62009-01-31 16:36:08 +000012110 i = 0;
12111 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 i++;
12114 }
12115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 j = len;
12118 if (striptype != LEFTSTRIP) {
12119 do {
12120 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 j++;
12123 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124
Victor Stinner7931d9a2011-11-04 00:22:48 +010012125 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126}
12127
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012128
12129static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012130do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012132 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012133
Benjamin Peterson14339b62009-01-31 16:36:08 +000012134 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12135 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 if (sep != NULL && sep != Py_None) {
12138 if (PyUnicode_Check(sep))
12139 return _PyUnicode_XStrip(self, striptype, sep);
12140 else {
12141 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "%s arg must be None or str",
12143 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 return NULL;
12145 }
12146 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147
Benjamin Peterson14339b62009-01-31 16:36:08 +000012148 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149}
12150
12151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154\n\
12155Return a copy of the string S with leading and trailing\n\
12156whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012157If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012158
12159static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012160unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012162 if (PyTuple_GET_SIZE(args) == 0)
12163 return do_strip(self, BOTHSTRIP); /* Common case */
12164 else
12165 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166}
12167
12168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012169PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012170 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171\n\
12172Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012173If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012174
12175static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012176unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012177{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012178 if (PyTuple_GET_SIZE(args) == 0)
12179 return do_strip(self, LEFTSTRIP); /* Common case */
12180 else
12181 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012182}
12183
12184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012185PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012187\n\
12188Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012189If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012190
12191static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012192unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012193{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012194 if (PyTuple_GET_SIZE(args) == 0)
12195 return do_strip(self, RIGHTSTRIP); /* Common case */
12196 else
12197 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012198}
12199
12200
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012202unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012204 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
Georg Brandl222de0f2009-04-12 12:01:50 +000012207 if (len < 1) {
12208 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012209 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211
Victor Stinnerc4b49542011-12-11 22:44:26 +010012212 /* no repeat, return original string */
12213 if (len == 1)
12214 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012215
Benjamin Petersonbac79492012-01-14 13:34:47 -050012216 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 return NULL;
12218
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012219 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012220 PyErr_SetString(PyExc_OverflowError,
12221 "repeated string is too long");
12222 return NULL;
12223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012225
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012226 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 if (!u)
12228 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012229 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (PyUnicode_GET_LENGTH(str) == 1) {
12232 const int kind = PyUnicode_KIND(str);
12233 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012234 if (kind == PyUnicode_1BYTE_KIND) {
12235 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012236 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012237 }
12238 else if (kind == PyUnicode_2BYTE_KIND) {
12239 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012240 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012241 ucs2[n] = fill_char;
12242 } else {
12243 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12244 assert(kind == PyUnicode_4BYTE_KIND);
12245 for (n = 0; n < len; ++n)
12246 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 }
12249 else {
12250 /* number of characters copied this far */
12251 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012252 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 char *to = (char *) PyUnicode_DATA(u);
12254 Py_MEMCPY(to, PyUnicode_DATA(str),
12255 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 n = (done <= nchars-done) ? done : nchars-done;
12258 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012259 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 }
12262
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012263 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012264 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
Alexander Belopolsky40018472011-02-26 01:02:56 +000012267PyObject *
12268PyUnicode_Replace(PyObject *obj,
12269 PyObject *subobj,
12270 PyObject *replobj,
12271 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
12273 PyObject *self;
12274 PyObject *str1;
12275 PyObject *str2;
12276 PyObject *result;
12277
12278 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012279 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012282 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 Py_DECREF(self);
12284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285 }
12286 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012287 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012288 Py_DECREF(self);
12289 Py_DECREF(str1);
12290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012292 if (PyUnicode_READY(self) == -1 ||
12293 PyUnicode_READY(str1) == -1 ||
12294 PyUnicode_READY(str2) == -1)
12295 result = NULL;
12296 else
12297 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298 Py_DECREF(self);
12299 Py_DECREF(str1);
12300 Py_DECREF(str2);
12301 return result;
12302}
12303
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012304PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012305 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306\n\
12307Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012308old replaced by new. If the optional argument count is\n\
12309given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310
12311static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 PyObject *str1;
12315 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012316 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317 PyObject *result;
12318
Martin v. Löwis18e16552006-02-15 17:27:45 +000012319 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012321 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012324 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 return NULL;
12326 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012327 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 Py_DECREF(str1);
12329 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012330 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012331 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12332 result = NULL;
12333 else
12334 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335
12336 Py_DECREF(str1);
12337 Py_DECREF(str2);
12338 return result;
12339}
12340
Alexander Belopolsky40018472011-02-26 01:02:56 +000012341static PyObject *
12342unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012344 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 Py_ssize_t isize;
12346 Py_ssize_t osize, squote, dquote, i, o;
12347 Py_UCS4 max, quote;
12348 int ikind, okind;
12349 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012352 return NULL;
12353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 isize = PyUnicode_GET_LENGTH(unicode);
12355 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 /* Compute length of output, quote characters, and
12358 maximum character */
12359 osize = 2; /* quotes */
12360 max = 127;
12361 squote = dquote = 0;
12362 ikind = PyUnicode_KIND(unicode);
12363 for (i = 0; i < isize; i++) {
12364 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12365 switch (ch) {
12366 case '\'': squote++; osize++; break;
12367 case '"': dquote++; osize++; break;
12368 case '\\': case '\t': case '\r': case '\n':
12369 osize += 2; break;
12370 default:
12371 /* Fast-path ASCII */
12372 if (ch < ' ' || ch == 0x7f)
12373 osize += 4; /* \xHH */
12374 else if (ch < 0x7f)
12375 osize++;
12376 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12377 osize++;
12378 max = ch > max ? ch : max;
12379 }
12380 else if (ch < 0x100)
12381 osize += 4; /* \xHH */
12382 else if (ch < 0x10000)
12383 osize += 6; /* \uHHHH */
12384 else
12385 osize += 10; /* \uHHHHHHHH */
12386 }
12387 }
12388
12389 quote = '\'';
12390 if (squote) {
12391 if (dquote)
12392 /* Both squote and dquote present. Use squote,
12393 and escape them */
12394 osize += squote;
12395 else
12396 quote = '"';
12397 }
12398
12399 repr = PyUnicode_New(osize, max);
12400 if (repr == NULL)
12401 return NULL;
12402 okind = PyUnicode_KIND(repr);
12403 odata = PyUnicode_DATA(repr);
12404
12405 PyUnicode_WRITE(okind, odata, 0, quote);
12406 PyUnicode_WRITE(okind, odata, osize-1, quote);
12407
12408 for (i = 0, o = 1; i < isize; i++) {
12409 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012410
12411 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 if ((ch == quote) || (ch == '\\')) {
12413 PyUnicode_WRITE(okind, odata, o++, '\\');
12414 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012415 continue;
12416 }
12417
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012419 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012420 PyUnicode_WRITE(okind, odata, o++, '\\');
12421 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012422 }
12423 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 PyUnicode_WRITE(okind, odata, o++, '\\');
12425 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012426 }
12427 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 PyUnicode_WRITE(okind, odata, o++, '\\');
12429 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012430 }
12431
12432 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012433 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 PyUnicode_WRITE(okind, odata, o++, '\\');
12435 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012438 }
12439
Georg Brandl559e5d72008-06-11 18:37:52 +000012440 /* Copy ASCII characters as-is */
12441 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012443 }
12444
Benjamin Peterson29060642009-01-31 22:14:21 +000012445 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012446 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012447 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012448 (categories Z* and C* except ASCII space)
12449 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012451 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012452 if (ch <= 0xff) {
12453 PyUnicode_WRITE(okind, odata, o++, '\\');
12454 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12456 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012457 }
12458 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 else if (ch >= 0x10000) {
12460 PyUnicode_WRITE(okind, odata, o++, '\\');
12461 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012470 }
12471 /* Map 16-bit characters to '\uxxxx' */
12472 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 PyUnicode_WRITE(okind, odata, o++, '\\');
12474 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12476 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12477 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12478 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012479 }
12480 }
12481 /* Copy characters as-is */
12482 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012484 }
12485 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012486 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012488 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012489 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490}
12491
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012492PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012493 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494\n\
12495Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012496such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497arguments start and end are interpreted as in slice notation.\n\
12498\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012499Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
12501static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012504 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012505 Py_ssize_t start;
12506 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012507 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
Jesus Ceaac451502011-04-20 17:09:23 +020012509 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12510 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012511 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 if (PyUnicode_READY(self) == -1)
12514 return NULL;
12515 if (PyUnicode_READY(substring) == -1)
12516 return NULL;
12517
Victor Stinner7931d9a2011-11-04 00:22:48 +010012518 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519
12520 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 if (result == -2)
12523 return NULL;
12524
Christian Heimes217cfd12007-12-02 14:31:20 +000012525 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526}
12527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012528PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012531Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
12533static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012536 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012537 Py_ssize_t start;
12538 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012539 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
Jesus Ceaac451502011-04-20 17:09:23 +020012541 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12542 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012543 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 if (PyUnicode_READY(self) == -1)
12546 return NULL;
12547 if (PyUnicode_READY(substring) == -1)
12548 return NULL;
12549
Victor Stinner7931d9a2011-11-04 00:22:48 +010012550 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
12552 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 if (result == -2)
12555 return NULL;
12556
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557 if (result < 0) {
12558 PyErr_SetString(PyExc_ValueError, "substring not found");
12559 return NULL;
12560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561
Christian Heimes217cfd12007-12-02 14:31:20 +000012562 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563}
12564
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012565PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012566 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012568Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012569done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
12571static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012572unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012574 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 Py_UCS4 fillchar = ' ';
12576
Victor Stinnere9a29352011-10-01 02:14:59 +020012577 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012579
Benjamin Petersonbac79492012-01-14 13:34:47 -050012580 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581 return NULL;
12582
Victor Stinnerc4b49542011-12-11 22:44:26 +010012583 if (PyUnicode_GET_LENGTH(self) >= width)
12584 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585
Victor Stinnerc4b49542011-12-11 22:44:26 +010012586 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587}
12588
Alexander Belopolsky40018472011-02-26 01:02:56 +000012589PyObject *
12590PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591{
12592 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012593
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594 s = PyUnicode_FromObject(s);
12595 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012596 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012597 if (sep != NULL) {
12598 sep = PyUnicode_FromObject(sep);
12599 if (sep == NULL) {
12600 Py_DECREF(s);
12601 return NULL;
12602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603 }
12604
Victor Stinner9310abb2011-10-05 00:59:23 +020012605 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606
12607 Py_DECREF(s);
12608 Py_XDECREF(sep);
12609 return result;
12610}
12611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012612PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012613 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614\n\
12615Return a list of the words in S, using sep as the\n\
12616delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012617splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012618whitespace string is a separator and empty strings are\n\
12619removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
12621static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012622unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012624 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012626 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012628 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12629 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 return NULL;
12631
12632 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012635 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012637 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638}
12639
Thomas Wouters477c8d52006-05-27 19:21:47 +000012640PyObject *
12641PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12642{
12643 PyObject* str_obj;
12644 PyObject* sep_obj;
12645 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 int kind1, kind2, kind;
12647 void *buf1 = NULL, *buf2 = NULL;
12648 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649
12650 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012651 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012652 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012653 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012654 if (!sep_obj) {
12655 Py_DECREF(str_obj);
12656 return NULL;
12657 }
12658 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12659 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012660 Py_DECREF(str_obj);
12661 return NULL;
12662 }
12663
Victor Stinner14f8f022011-10-05 20:58:25 +020012664 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012666 kind = Py_MAX(kind1, kind2);
12667 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012669 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 if (!buf1)
12671 goto onError;
12672 buf2 = PyUnicode_DATA(sep_obj);
12673 if (kind2 != kind)
12674 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12675 if (!buf2)
12676 goto onError;
12677 len1 = PyUnicode_GET_LENGTH(str_obj);
12678 len2 = PyUnicode_GET_LENGTH(sep_obj);
12679
Benjamin Petersonead6b532011-12-20 17:23:42 -060012680 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012682 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12683 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12684 else
12685 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012686 break;
12687 case PyUnicode_2BYTE_KIND:
12688 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12689 break;
12690 case PyUnicode_4BYTE_KIND:
12691 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12692 break;
12693 default:
12694 assert(0);
12695 out = 0;
12696 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012697
12698 Py_DECREF(sep_obj);
12699 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 if (kind1 != kind)
12701 PyMem_Free(buf1);
12702 if (kind2 != kind)
12703 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012704
12705 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 onError:
12707 Py_DECREF(sep_obj);
12708 Py_DECREF(str_obj);
12709 if (kind1 != kind && buf1)
12710 PyMem_Free(buf1);
12711 if (kind2 != kind && buf2)
12712 PyMem_Free(buf2);
12713 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012714}
12715
12716
12717PyObject *
12718PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12719{
12720 PyObject* str_obj;
12721 PyObject* sep_obj;
12722 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 int kind1, kind2, kind;
12724 void *buf1 = NULL, *buf2 = NULL;
12725 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012726
12727 str_obj = PyUnicode_FromObject(str_in);
12728 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012730 sep_obj = PyUnicode_FromObject(sep_in);
12731 if (!sep_obj) {
12732 Py_DECREF(str_obj);
12733 return NULL;
12734 }
12735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 kind1 = PyUnicode_KIND(str_in);
12737 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012738 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 buf1 = PyUnicode_DATA(str_in);
12740 if (kind1 != kind)
12741 buf1 = _PyUnicode_AsKind(str_in, kind);
12742 if (!buf1)
12743 goto onError;
12744 buf2 = PyUnicode_DATA(sep_obj);
12745 if (kind2 != kind)
12746 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12747 if (!buf2)
12748 goto onError;
12749 len1 = PyUnicode_GET_LENGTH(str_obj);
12750 len2 = PyUnicode_GET_LENGTH(sep_obj);
12751
Benjamin Petersonead6b532011-12-20 17:23:42 -060012752 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012754 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12755 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12756 else
12757 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 break;
12759 case PyUnicode_2BYTE_KIND:
12760 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12761 break;
12762 case PyUnicode_4BYTE_KIND:
12763 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12764 break;
12765 default:
12766 assert(0);
12767 out = 0;
12768 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012769
12770 Py_DECREF(sep_obj);
12771 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 if (kind1 != kind)
12773 PyMem_Free(buf1);
12774 if (kind2 != kind)
12775 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012776
12777 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 onError:
12779 Py_DECREF(sep_obj);
12780 Py_DECREF(str_obj);
12781 if (kind1 != kind && buf1)
12782 PyMem_Free(buf1);
12783 if (kind2 != kind && buf2)
12784 PyMem_Free(buf2);
12785 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786}
12787
12788PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012791Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012793found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794
12795static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012796unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797{
Victor Stinner9310abb2011-10-05 00:59:23 +020012798 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799}
12800
12801PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012802 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012804Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012806separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807
12808static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012809unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810{
Victor Stinner9310abb2011-10-05 00:59:23 +020012811 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812}
12813
Alexander Belopolsky40018472011-02-26 01:02:56 +000012814PyObject *
12815PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012816{
12817 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012819 s = PyUnicode_FromObject(s);
12820 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012822 if (sep != NULL) {
12823 sep = PyUnicode_FromObject(sep);
12824 if (sep == NULL) {
12825 Py_DECREF(s);
12826 return NULL;
12827 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012828 }
12829
Victor Stinner9310abb2011-10-05 00:59:23 +020012830 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012831
12832 Py_DECREF(s);
12833 Py_XDECREF(sep);
12834 return result;
12835}
12836
12837PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012838 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012839\n\
12840Return a list of the words in S, using sep as the\n\
12841delimiter string, starting at the end of the string and\n\
12842working to the front. If maxsplit is given, at most maxsplit\n\
12843splits are done. If sep is not specified, any whitespace string\n\
12844is a separator.");
12845
12846static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012847unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012849 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012850 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012851 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012852
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012853 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12854 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855 return NULL;
12856
12857 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012860 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012861 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012862 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012863}
12864
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012865PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867\n\
12868Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012869Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012870is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871
12872static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012873unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012875 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012876 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012878 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12879 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880 return NULL;
12881
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012882 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883}
12884
12885static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012886PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012888 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889}
12890
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012891PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893\n\
12894Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012895and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896
12897static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012898unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012900 if (PyUnicode_READY(self) == -1)
12901 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012902 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012903}
12904
Georg Brandlceee0772007-11-27 23:48:05 +000012905PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012906 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012907\n\
12908Return a translation table usable for str.translate().\n\
12909If there is only one argument, it must be a dictionary mapping Unicode\n\
12910ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012911Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012912If there are two arguments, they must be strings of equal length, and\n\
12913in the resulting dictionary, each character in x will be mapped to the\n\
12914character at the same position in y. If there is a third argument, it\n\
12915must be a string, whose characters will be mapped to None in the result.");
12916
12917static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012918unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012919{
12920 PyObject *x, *y = NULL, *z = NULL;
12921 PyObject *new = NULL, *key, *value;
12922 Py_ssize_t i = 0;
12923 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012924
Georg Brandlceee0772007-11-27 23:48:05 +000012925 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12926 return NULL;
12927 new = PyDict_New();
12928 if (!new)
12929 return NULL;
12930 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 int x_kind, y_kind, z_kind;
12932 void *x_data, *y_data, *z_data;
12933
Georg Brandlceee0772007-11-27 23:48:05 +000012934 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012935 if (!PyUnicode_Check(x)) {
12936 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12937 "be a string if there is a second argument");
12938 goto err;
12939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012941 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12942 "arguments must have equal length");
12943 goto err;
12944 }
12945 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 x_kind = PyUnicode_KIND(x);
12947 y_kind = PyUnicode_KIND(y);
12948 x_data = PyUnicode_DATA(x);
12949 y_data = PyUnicode_DATA(y);
12950 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12951 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012952 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012953 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012954 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012955 if (!value) {
12956 Py_DECREF(key);
12957 goto err;
12958 }
Georg Brandlceee0772007-11-27 23:48:05 +000012959 res = PyDict_SetItem(new, key, value);
12960 Py_DECREF(key);
12961 Py_DECREF(value);
12962 if (res < 0)
12963 goto err;
12964 }
12965 /* create entries for deleting chars in z */
12966 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 z_kind = PyUnicode_KIND(z);
12968 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012969 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012971 if (!key)
12972 goto err;
12973 res = PyDict_SetItem(new, key, Py_None);
12974 Py_DECREF(key);
12975 if (res < 0)
12976 goto err;
12977 }
12978 }
12979 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 int kind;
12981 void *data;
12982
Georg Brandlceee0772007-11-27 23:48:05 +000012983 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012984 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012985 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12986 "to maketrans it must be a dict");
12987 goto err;
12988 }
12989 /* copy entries into the new dict, converting string keys to int keys */
12990 while (PyDict_Next(x, &i, &key, &value)) {
12991 if (PyUnicode_Check(key)) {
12992 /* convert string keys to integer keys */
12993 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012994 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012995 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12996 "table must be of length 1");
12997 goto err;
12998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 kind = PyUnicode_KIND(key);
13000 data = PyUnicode_DATA(key);
13001 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013002 if (!newkey)
13003 goto err;
13004 res = PyDict_SetItem(new, newkey, value);
13005 Py_DECREF(newkey);
13006 if (res < 0)
13007 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013008 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013009 /* just keep integer keys */
13010 if (PyDict_SetItem(new, key, value) < 0)
13011 goto err;
13012 } else {
13013 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13014 "be strings or integers");
13015 goto err;
13016 }
13017 }
13018 }
13019 return new;
13020 err:
13021 Py_DECREF(new);
13022 return NULL;
13023}
13024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013025PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027\n\
13028Return a copy of the string S, where all characters have been mapped\n\
13029through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013030Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013031Unmapped characters are left untouched. Characters mapped to None\n\
13032are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033
13034static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038}
13039
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013040PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013043Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044
13045static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013046unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013048 if (PyUnicode_READY(self) == -1)
13049 return NULL;
13050 if (PyUnicode_IS_ASCII(self))
13051 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013052 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053}
13054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013055PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013056 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013058Pad a numeric string S with zeros on the left, to fill a field\n\
13059of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060
13061static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013062unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013064 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013065 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013066 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013067 int kind;
13068 void *data;
13069 Py_UCS4 chr;
13070
Martin v. Löwis18e16552006-02-15 17:27:45 +000013071 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072 return NULL;
13073
Benjamin Petersonbac79492012-01-14 13:34:47 -050013074 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076
Victor Stinnerc4b49542011-12-11 22:44:26 +010013077 if (PyUnicode_GET_LENGTH(self) >= width)
13078 return unicode_result_unchanged(self);
13079
13080 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081
13082 u = pad(self, fill, 0, '0');
13083
Walter Dörwald068325e2002-04-15 13:36:47 +000013084 if (u == NULL)
13085 return NULL;
13086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 kind = PyUnicode_KIND(u);
13088 data = PyUnicode_DATA(u);
13089 chr = PyUnicode_READ(kind, data, fill);
13090
13091 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 PyUnicode_WRITE(kind, data, 0, chr);
13094 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095 }
13096
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013097 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013098 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
13108
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013109PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013110 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013112Return True if S starts with the specified prefix, False otherwise.\n\
13113With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013114With optional end, stop comparing S at that position.\n\
13115prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116
13117static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013118unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013119 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013121 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013123 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013124 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013125 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
Jesus Ceaac451502011-04-20 17:09:23 +020013127 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013129 if (PyTuple_Check(subobj)) {
13130 Py_ssize_t i;
13131 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013132 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013133 if (substring == NULL)
13134 return NULL;
13135 result = tailmatch(self, substring, start, end, -1);
13136 Py_DECREF(substring);
13137 if (result) {
13138 Py_RETURN_TRUE;
13139 }
13140 }
13141 /* nothing matched */
13142 Py_RETURN_FALSE;
13143 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013144 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013145 if (substring == NULL) {
13146 if (PyErr_ExceptionMatches(PyExc_TypeError))
13147 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13148 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013150 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013151 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013153 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013154}
13155
13156
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013157PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013159\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013160Return True if S ends with the specified suffix, False otherwise.\n\
13161With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162With optional end, stop comparing S at that position.\n\
13163suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164
13165static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013166unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013168{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013169 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013170 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013171 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013172 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013173 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174
Jesus Ceaac451502011-04-20 17:09:23 +020013175 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013177 if (PyTuple_Check(subobj)) {
13178 Py_ssize_t i;
13179 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013180 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013183 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013184 result = tailmatch(self, substring, start, end, +1);
13185 Py_DECREF(substring);
13186 if (result) {
13187 Py_RETURN_TRUE;
13188 }
13189 }
13190 Py_RETURN_FALSE;
13191 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013192 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013193 if (substring == NULL) {
13194 if (PyErr_ExceptionMatches(PyExc_TypeError))
13195 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13196 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013198 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013201 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202}
13203
Victor Stinner202fdca2012-05-07 12:47:02 +020013204typedef struct {
13205 PyObject *buffer;
13206 void *data;
13207 enum PyUnicode_Kind kind;
13208 Py_UCS4 maxchar;
13209 Py_ssize_t pos;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013210} _PyUnicodeWriter ;
Victor Stinner202fdca2012-05-07 12:47:02 +020013211
13212Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013213_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013214{
13215 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13216 writer->data = PyUnicode_DATA(writer->buffer);
13217 writer->kind = PyUnicode_KIND(writer->buffer);
13218}
13219
13220Py_LOCAL(int)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013221_PyUnicodeWriter_Init(_PyUnicodeWriter *writer,
Victor Stinner202fdca2012-05-07 12:47:02 +020013222 Py_ssize_t length, Py_UCS4 maxchar)
13223{
13224 writer->pos = 0;
13225 writer->buffer = PyUnicode_New(length, maxchar);
13226 if (writer->buffer == NULL)
13227 return -1;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013228 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013229 return 0;
13230}
13231
13232Py_LOCAL_INLINE(int)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013233_PyUnicodeWriter_Prepare(_PyUnicodeWriter *writer,
Victor Stinner202fdca2012-05-07 12:47:02 +020013234 Py_ssize_t length, Py_UCS4 maxchar)
13235{
13236 Py_ssize_t newlen;
13237 PyObject *newbuffer;
13238
13239 if (length > PY_SSIZE_T_MAX - writer->pos) {
13240 PyErr_NoMemory();
13241 return -1;
13242 }
13243 newlen = writer->pos + length;
13244
13245 if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
Victor Stinner10680252012-05-07 23:50:05 +020013246 /* overallocate 25% to limit the number of resize */
13247 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
Victor Stinner202fdca2012-05-07 12:47:02 +020013248 newlen += newlen / 4;
13249
13250 if (maxchar > writer->maxchar) {
13251 /* resize + widen */
13252 newbuffer = PyUnicode_New(newlen, maxchar);
13253 if (newbuffer == NULL)
13254 return -1;
13255 PyUnicode_CopyCharacters(newbuffer, 0,
13256 writer->buffer, 0, writer->pos);
13257 Py_DECREF(writer->buffer);
13258 }
13259 else {
13260 newbuffer = resize_compact(writer->buffer, newlen);
13261 if (newbuffer == NULL)
13262 return -1;
13263 }
13264 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013265 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013266 }
13267 else if (maxchar > writer->maxchar) {
13268 if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
13269 return -1;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013270 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013271 }
13272 return 0;
13273}
13274
Victor Stinner202fdca2012-05-07 12:47:02 +020013275Py_LOCAL(PyObject *)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013276_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013277{
13278 if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
13279 Py_DECREF(writer->buffer);
13280 return NULL;
13281 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020013282 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020013283 return writer->buffer;
13284}
13285
13286Py_LOCAL(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013287_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013288{
13289 Py_CLEAR(writer->buffer);
13290}
13291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013293
13294PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013296\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013297Return a formatted version of S, using substitutions from args and kwargs.\n\
13298The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013299
Eric Smith27bbca62010-11-04 17:06:58 +000013300PyDoc_STRVAR(format_map__doc__,
13301 "S.format_map(mapping) -> str\n\
13302\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013303Return a formatted version of S, using substitutions from mapping.\n\
13304The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013305
Eric Smith4a7d76d2008-05-30 18:10:19 +000013306static PyObject *
13307unicode__format__(PyObject* self, PyObject* args)
13308{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013309 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013310
13311 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13312 return NULL;
13313
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013314 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013316 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013317}
13318
Eric Smith8c663262007-08-25 02:26:07 +000013319PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013320 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013321\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013322Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013323
13324static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013325unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 Py_ssize_t size;
13328
13329 /* If it's a compact object, account for base structure +
13330 character data. */
13331 if (PyUnicode_IS_COMPACT_ASCII(v))
13332 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13333 else if (PyUnicode_IS_COMPACT(v))
13334 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013335 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013336 else {
13337 /* If it is a two-block object, account for base object, and
13338 for character block if present. */
13339 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013340 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013342 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013343 }
13344 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013345 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013346 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013347 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013348 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013349 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350
13351 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013352}
13353
13354PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013355 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013356
13357static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013358unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013359{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013360 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 if (!copy)
13362 return NULL;
13363 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013364}
13365
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013367 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013368 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013369 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13370 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013371 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13372 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013373 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013374 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13375 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13376 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13377 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13378 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013379 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013380 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13381 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13382 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013383 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013384 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13385 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13386 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013387 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013388 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013389 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013390 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013391 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13392 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13393 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13394 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13395 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13396 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13397 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13398 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13399 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13400 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13401 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13402 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13403 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13404 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013405 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013406 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013407 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013408 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013409 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013410 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013411 {"maketrans", (PyCFunction) unicode_maketrans,
13412 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013413 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013414#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013415 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013416 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417#endif
13418
Benjamin Peterson14339b62009-01-31 16:36:08 +000013419 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420 {NULL, NULL}
13421};
13422
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013423static PyObject *
13424unicode_mod(PyObject *v, PyObject *w)
13425{
Brian Curtindfc80e32011-08-10 20:28:54 -050013426 if (!PyUnicode_Check(v))
13427 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013429}
13430
13431static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013432 0, /*nb_add*/
13433 0, /*nb_subtract*/
13434 0, /*nb_multiply*/
13435 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013436};
13437
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013439 (lenfunc) unicode_length, /* sq_length */
13440 PyUnicode_Concat, /* sq_concat */
13441 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13442 (ssizeargfunc) unicode_getitem, /* sq_item */
13443 0, /* sq_slice */
13444 0, /* sq_ass_item */
13445 0, /* sq_ass_slice */
13446 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447};
13448
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013449static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013450unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013452 if (PyUnicode_READY(self) == -1)
13453 return NULL;
13454
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013455 if (PyIndex_Check(item)) {
13456 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013457 if (i == -1 && PyErr_Occurred())
13458 return NULL;
13459 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013461 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013462 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013463 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013464 PyObject *result;
13465 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013466 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013467 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013469 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013471 return NULL;
13472 }
13473
13474 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013475 Py_INCREF(unicode_empty);
13476 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013478 slicelength == PyUnicode_GET_LENGTH(self)) {
13479 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013480 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013481 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013482 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013483 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013484 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013485 src_kind = PyUnicode_KIND(self);
13486 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013487 if (!PyUnicode_IS_ASCII(self)) {
13488 kind_limit = kind_maxchar_limit(src_kind);
13489 max_char = 0;
13490 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13491 ch = PyUnicode_READ(src_kind, src_data, cur);
13492 if (ch > max_char) {
13493 max_char = ch;
13494 if (max_char >= kind_limit)
13495 break;
13496 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013497 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013498 }
Victor Stinner55c99112011-10-13 01:17:06 +020013499 else
13500 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013501 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013502 if (result == NULL)
13503 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013504 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013505 dest_data = PyUnicode_DATA(result);
13506
13507 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013508 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13509 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013510 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013511 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013512 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013513 } else {
13514 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13515 return NULL;
13516 }
13517}
13518
13519static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013520 (lenfunc)unicode_length, /* mp_length */
13521 (binaryfunc)unicode_subscript, /* mp_subscript */
13522 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013523};
13524
Guido van Rossumd57fd912000-03-10 22:53:23 +000013525
Guido van Rossumd57fd912000-03-10 22:53:23 +000013526/* Helpers for PyUnicode_Format() */
13527
13528static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013529getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013530{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013531 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013532 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 (*p_argidx)++;
13534 if (arglen < 0)
13535 return args;
13536 else
13537 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013538 }
13539 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013541 return NULL;
13542}
13543
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013544/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013545
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013546static PyObject *
13547formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013548{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013549 char *p;
13550 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013551 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013552
Guido van Rossumd57fd912000-03-10 22:53:23 +000013553 x = PyFloat_AsDouble(v);
13554 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013555 return NULL;
13556
Guido van Rossumd57fd912000-03-10 22:53:23 +000013557 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013559
Eric Smith0923d1d2009-04-16 20:16:10 +000013560 p = PyOS_double_to_string(x, type, prec,
13561 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013562 if (p == NULL)
13563 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013564 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013565 PyMem_Free(p);
13566 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013567}
13568
Victor Stinnerd0880d52012-04-27 23:40:13 +020013569/* formatlong() emulates the format codes d, u, o, x and X, and
13570 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13571 * Python's regular ints.
13572 * Return value: a new PyUnicodeObject*, or NULL if error.
13573 * The output string is of the form
13574 * "-"? ("0x" | "0X")? digit+
13575 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13576 * set in flags. The case of hex digits will be correct,
13577 * There will be at least prec digits, zero-filled on the left if
13578 * necessary to get that many.
13579 * val object to be converted
13580 * flags bitmask of format flags; only F_ALT is looked at
13581 * prec minimum number of digits; 0-fill on left if needed
13582 * type a character in [duoxX]; u acts the same as d
13583 *
13584 * CAUTION: o, x and X conversions on regular ints can never
13585 * produce a '-' sign, but can for Python's unbounded ints.
13586 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013587static PyObject*
13588formatlong(PyObject *val, int flags, int prec, int type)
13589{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013590 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013591 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013592 Py_ssize_t i;
13593 int sign; /* 1 if '-', else 0 */
13594 int len; /* number of characters */
13595 Py_ssize_t llen;
13596 int numdigits; /* len == numnondigits + numdigits */
13597 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013598
Victor Stinnerd0880d52012-04-27 23:40:13 +020013599 /* Avoid exceeding SSIZE_T_MAX */
13600 if (prec > INT_MAX-3) {
13601 PyErr_SetString(PyExc_OverflowError,
13602 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013603 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013604 }
13605
13606 assert(PyLong_Check(val));
13607
13608 switch (type) {
13609 case 'd':
13610 case 'u':
13611 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013612 if (PyBool_Check(val))
13613 result = PyNumber_ToBase(val, 10);
13614 else
13615 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013616 break;
13617 case 'o':
13618 numnondigits = 2;
13619 result = PyNumber_ToBase(val, 8);
13620 break;
13621 case 'x':
13622 case 'X':
13623 numnondigits = 2;
13624 result = PyNumber_ToBase(val, 16);
13625 break;
13626 default:
13627 assert(!"'type' not in [duoxX]");
13628 }
13629 if (!result)
13630 return NULL;
13631
13632 assert(unicode_modifiable(result));
13633 assert(PyUnicode_IS_READY(result));
13634 assert(PyUnicode_IS_ASCII(result));
13635
13636 /* To modify the string in-place, there can only be one reference. */
13637 if (Py_REFCNT(result) != 1) {
13638 PyErr_BadInternalCall();
13639 return NULL;
13640 }
13641 buf = PyUnicode_DATA(result);
13642 llen = PyUnicode_GET_LENGTH(result);
13643 if (llen > INT_MAX) {
13644 PyErr_SetString(PyExc_ValueError,
13645 "string too large in _PyBytes_FormatLong");
13646 return NULL;
13647 }
13648 len = (int)llen;
13649 sign = buf[0] == '-';
13650 numnondigits += sign;
13651 numdigits = len - numnondigits;
13652 assert(numdigits > 0);
13653
13654 /* Get rid of base marker unless F_ALT */
13655 if (((flags & F_ALT) == 0 &&
13656 (type == 'o' || type == 'x' || type == 'X'))) {
13657 assert(buf[sign] == '0');
13658 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13659 buf[sign+1] == 'o');
13660 numnondigits -= 2;
13661 buf += 2;
13662 len -= 2;
13663 if (sign)
13664 buf[0] = '-';
13665 assert(len == numnondigits + numdigits);
13666 assert(numdigits > 0);
13667 }
13668
13669 /* Fill with leading zeroes to meet minimum width. */
13670 if (prec > numdigits) {
13671 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13672 numnondigits + prec);
13673 char *b1;
13674 if (!r1) {
13675 Py_DECREF(result);
13676 return NULL;
13677 }
13678 b1 = PyBytes_AS_STRING(r1);
13679 for (i = 0; i < numnondigits; ++i)
13680 *b1++ = *buf++;
13681 for (i = 0; i < prec - numdigits; i++)
13682 *b1++ = '0';
13683 for (i = 0; i < numdigits; i++)
13684 *b1++ = *buf++;
13685 *b1 = '\0';
13686 Py_DECREF(result);
13687 result = r1;
13688 buf = PyBytes_AS_STRING(result);
13689 len = numnondigits + prec;
13690 }
13691
13692 /* Fix up case for hex conversions. */
13693 if (type == 'X') {
13694 /* Need to convert all lower case letters to upper case.
13695 and need to convert 0x to 0X (and -0x to -0X). */
13696 for (i = 0; i < len; i++)
13697 if (buf[i] >= 'a' && buf[i] <= 'x')
13698 buf[i] -= 'a'-'A';
13699 }
13700 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13701 PyObject *unicode;
13702 unicode = unicode_fromascii((unsigned char *)buf, len);
13703 Py_DECREF(result);
13704 result = unicode;
13705 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013706 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013707}
13708
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013709static Py_UCS4
13710formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013711{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013712 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013713 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013714 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013715 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013716 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 goto onError;
13718 }
13719 else {
13720 /* Integer input truncated to a character */
13721 long x;
13722 x = PyLong_AsLong(v);
13723 if (x == -1 && PyErr_Occurred())
13724 goto onError;
13725
Victor Stinner8faf8212011-12-08 22:14:11 +010013726 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013727 PyErr_SetString(PyExc_OverflowError,
13728 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013729 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 }
13731
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013732 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013733 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013734
Benjamin Peterson29060642009-01-31 22:14:21 +000013735 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013736 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013738 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013739}
13740
Alexander Belopolsky40018472011-02-26 01:02:56 +000013741PyObject *
13742PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013743{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013744 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013745 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013746 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013747 PyObject *temp = NULL;
13748 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013749 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013750 void *fmt;
13751 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013752 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013753 Py_ssize_t sublen;
13754 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013755
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013757 PyErr_BadInternalCall();
13758 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013760 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013761 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013762 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013763 if (PyUnicode_READY(uformat) == -1)
13764 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013766 fmt = PyUnicode_DATA(uformat);
13767 fmtkind = PyUnicode_KIND(uformat);
13768 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13769 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013770
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013771 if (_PyUnicodeWriter_Init(&writer, fmtcnt + 100, 127) < 0)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013772 goto onError;
13773
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013775 arglen = PyTuple_Size(args);
13776 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013777 }
13778 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 arglen = -1;
13780 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013782 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013783 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013784 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013785
13786 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013787 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013788 Py_ssize_t nonfmtpos;
13789 nonfmtpos = fmtpos++;
13790 while (fmtcnt >= 0 &&
13791 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13792 fmtpos++;
13793 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013794 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013795 if (fmtcnt < 0)
13796 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013797 sublen = fmtpos - nonfmtpos;
13798 maxchar = _PyUnicode_FindMaxChar(uformat,
13799 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013800 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013801 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013802
13803 copy_characters(writer.buffer, writer.pos,
13804 uformat, nonfmtpos, sublen);
13805 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013806 }
13807 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013808 /* Got a format specifier */
13809 int flags = 0;
13810 Py_ssize_t width = -1;
13811 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013813 Py_UCS4 fill;
13814 int sign;
13815 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013816 int isnumok;
13817 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013818 void *pbuf = NULL;
13819 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013820 Py_UCS4 bufmaxchar;
13821 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013823 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013824 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13825 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013826 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 Py_ssize_t keylen;
13828 PyObject *key;
13829 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013830
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 if (dict == NULL) {
13832 PyErr_SetString(PyExc_TypeError,
13833 "format requires a mapping");
13834 goto onError;
13835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013836 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013837 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013838 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013839 /* Skip over balanced parentheses */
13840 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013841 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13842 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013844 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013845 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013846 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013847 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013848 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013849 if (fmtcnt < 0 || pcount > 0) {
13850 PyErr_SetString(PyExc_ValueError,
13851 "incomplete format key");
13852 goto onError;
13853 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013854 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013855 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013856 if (key == NULL)
13857 goto onError;
13858 if (args_owned) {
13859 Py_DECREF(args);
13860 args_owned = 0;
13861 }
13862 args = PyObject_GetItem(dict, key);
13863 Py_DECREF(key);
13864 if (args == NULL) {
13865 goto onError;
13866 }
13867 args_owned = 1;
13868 arglen = -1;
13869 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013870 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013872 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13873 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 case '-': flags |= F_LJUST; continue;
13875 case '+': flags |= F_SIGN; continue;
13876 case ' ': flags |= F_BLANK; continue;
13877 case '#': flags |= F_ALT; continue;
13878 case '0': flags |= F_ZERO; continue;
13879 }
13880 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013881 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 if (c == '*') {
13883 v = getnextarg(args, arglen, &argidx);
13884 if (v == NULL)
13885 goto onError;
13886 if (!PyLong_Check(v)) {
13887 PyErr_SetString(PyExc_TypeError,
13888 "* wants int");
13889 goto onError;
13890 }
13891 width = PyLong_AsLong(v);
13892 if (width == -1 && PyErr_Occurred())
13893 goto onError;
13894 if (width < 0) {
13895 flags |= F_LJUST;
13896 width = -width;
13897 }
13898 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013899 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013900 }
13901 else if (c >= '0' && c <= '9') {
13902 width = c - '0';
13903 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013904 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013905 if (c < '0' || c > '9')
13906 break;
Mark Dickinson99e2e552012-05-07 11:20:50 +010013907 if (width > (PY_SSIZE_T_MAX - (c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013908 PyErr_SetString(PyExc_ValueError,
13909 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013910 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013911 }
13912 width = width*10 + (c - '0');
13913 }
13914 }
13915 if (c == '.') {
13916 prec = 0;
13917 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013918 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 if (c == '*') {
13920 v = getnextarg(args, arglen, &argidx);
13921 if (v == NULL)
13922 goto onError;
13923 if (!PyLong_Check(v)) {
13924 PyErr_SetString(PyExc_TypeError,
13925 "* wants int");
13926 goto onError;
13927 }
13928 prec = PyLong_AsLong(v);
13929 if (prec == -1 && PyErr_Occurred())
13930 goto onError;
13931 if (prec < 0)
13932 prec = 0;
13933 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 }
13936 else if (c >= '0' && c <= '9') {
13937 prec = c - '0';
13938 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013939 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 if (c < '0' || c > '9')
13941 break;
Mark Dickinson99e2e552012-05-07 11:20:50 +010013942 if (prec > (INT_MAX - (c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013943 PyErr_SetString(PyExc_ValueError,
13944 "prec too big");
13945 goto onError;
13946 }
13947 prec = prec*10 + (c - '0');
13948 }
13949 }
13950 } /* prec */
13951 if (fmtcnt >= 0) {
13952 if (c == 'h' || c == 'l' || c == 'L') {
13953 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013955 }
13956 }
13957 if (fmtcnt < 0) {
13958 PyErr_SetString(PyExc_ValueError,
13959 "incomplete format");
13960 goto onError;
13961 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013962
13963 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013964 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013965 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013966 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13967 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013968 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013969 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013970
13971
13972 v = getnextarg(args, arglen, &argidx);
13973 if (v == NULL)
13974 goto onError;
13975
Benjamin Peterson29060642009-01-31 22:14:21 +000013976 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013977 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013978 fill = ' ';
13979 switch (c) {
13980
Benjamin Peterson29060642009-01-31 22:14:21 +000013981 case 's':
13982 case 'r':
13983 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013984 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 temp = v;
13986 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013987 }
13988 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 if (c == 's')
13990 temp = PyObject_Str(v);
13991 else if (c == 'r')
13992 temp = PyObject_Repr(v);
13993 else
13994 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013995 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013996 break;
13997
13998 case 'i':
13999 case 'd':
14000 case 'u':
14001 case 'o':
14002 case 'x':
14003 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000014004 isnumok = 0;
14005 if (PyNumber_Check(v)) {
14006 PyObject *iobj=NULL;
14007
14008 if (PyLong_Check(v)) {
14009 iobj = v;
14010 Py_INCREF(iobj);
14011 }
14012 else {
14013 iobj = PyNumber_Long(v);
14014 }
14015 if (iobj!=NULL) {
14016 if (PyLong_Check(iobj)) {
14017 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020014018 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070014019 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000014020 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000014021 }
14022 else {
14023 Py_DECREF(iobj);
14024 }
14025 }
14026 }
14027 if (!isnumok) {
14028 PyErr_Format(PyExc_TypeError,
14029 "%%%c format: a number is required, "
14030 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
14031 goto onError;
14032 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014033 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014034 fill = '0';
14035 break;
14036
14037 case 'e':
14038 case 'E':
14039 case 'f':
14040 case 'F':
14041 case 'g':
14042 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000014043 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014044 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014045 fill = '0';
Victor Stinneraff3cc62012-04-30 05:19:21 +020014046 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014047 break;
14048
14049 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014050 {
14051 Py_UCS4 ch = formatchar(v);
14052 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014053 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020014054 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000014055 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014056 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014057
14058 default:
14059 PyErr_Format(PyExc_ValueError,
14060 "unsupported format character '%c' (0x%x) "
14061 "at index %zd",
14062 (31<=c && c<=126) ? (char)c : '?',
14063 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014064 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000014065 goto onError;
14066 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014067 if (temp == NULL)
14068 goto onError;
14069 assert (PyUnicode_Check(temp));
14070 if (PyUnicode_READY(temp) == -1) {
14071 Py_CLEAR(temp);
14072 goto onError;
14073 }
14074 kind = PyUnicode_KIND(temp);
14075 pbuf = PyUnicode_DATA(temp);
14076 len = PyUnicode_GET_LENGTH(temp);
14077
14078 if (c == 's' || c == 'r' || c == 'a') {
14079 if (prec >= 0 && len > prec)
14080 len = prec;
14081 }
14082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014083 /* pbuf is initialized here. */
14084 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000014085 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014086 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14087 if (ch == '-' || ch == '+') {
14088 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014089 len--;
14090 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000014091 }
14092 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014093 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000014094 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014095 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000014096 else
14097 sign = 0;
14098 }
14099 if (width < len)
14100 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020014101
14102 /* Compute the length and maximum character of the
14103 written characters */
14104 bufmaxchar = 127;
14105 if (!(flags & F_LJUST)) {
14106 if (sign) {
14107 if ((width-1) > len)
14108 bufmaxchar = Py_MAX(bufmaxchar, fill);
14109 }
14110 else {
14111 if (width > len)
14112 bufmaxchar = Py_MAX(bufmaxchar, fill);
14113 }
14114 }
14115 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
14116 bufmaxchar = Py_MAX(bufmaxchar, maxchar);
14117
14118 buflen = width;
14119 if (sign && len == width)
14120 buflen++;
14121
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014122 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020014123 goto onError;
14124
14125 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000014126 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014127 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020014128 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
14129 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014130 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014131 if (width > len)
14132 width--;
14133 }
14134 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014135 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014136 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014138 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14139 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14140 writer.pos += 2;
14141 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000014142 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014143 width -= 2;
14144 if (width < 0)
14145 width = 0;
14146 len -= 2;
14147 }
14148 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014149 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014150 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
14151 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014152 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014153 }
14154 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014155 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020014156 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
14157 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014158 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014159 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014160 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14161 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014162 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14163 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14164 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014165 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 }
14167 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014168
Victor Stinneree4544c2012-05-09 22:24:08 +020014169 copy_characters(writer.buffer, writer.pos,
14170 temp, pindex, len);
14171 writer.pos += len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014172 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020014173 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014174 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
14175 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014176 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014177
Benjamin Peterson29060642009-01-31 22:14:21 +000014178 if (dict && (argidx < arglen) && c != '%') {
14179 PyErr_SetString(PyExc_TypeError,
14180 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014181 goto onError;
14182 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014183 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014184 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185 } /* until end */
14186 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014187 PyErr_SetString(PyExc_TypeError,
14188 "not all arguments converted during string formatting");
14189 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190 }
14191
14192 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014193 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014194 }
14195 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014196 Py_XDECREF(temp);
14197 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014198 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199
Benjamin Peterson29060642009-01-31 22:14:21 +000014200 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014201 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014202 Py_XDECREF(temp);
14203 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014204 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014205 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014206 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014207 }
14208 return NULL;
14209}
14210
Jeremy Hylton938ace62002-07-17 16:30:39 +000014211static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014212unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14213
Tim Peters6d6c1a32001-08-02 04:15:00 +000014214static PyObject *
14215unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14216{
Benjamin Peterson29060642009-01-31 22:14:21 +000014217 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014218 static char *kwlist[] = {"object", "encoding", "errors", 0};
14219 char *encoding = NULL;
14220 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014221
Benjamin Peterson14339b62009-01-31 16:36:08 +000014222 if (type != &PyUnicode_Type)
14223 return unicode_subtype_new(type, args, kwds);
14224 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014225 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014226 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014227 if (x == NULL) {
14228 Py_INCREF(unicode_empty);
14229 return unicode_empty;
14230 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 if (encoding == NULL && errors == NULL)
14232 return PyObject_Str(x);
14233 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014234 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014235}
14236
Guido van Rossume023fe02001-08-30 03:12:59 +000014237static PyObject *
14238unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14239{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014240 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014241 Py_ssize_t length, char_size;
14242 int share_wstr, share_utf8;
14243 unsigned int kind;
14244 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014245
Benjamin Peterson14339b62009-01-31 16:36:08 +000014246 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014247
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014248 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014249 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014250 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014251 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014252 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014253 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014254 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014255 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014256
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014257 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014258 if (self == NULL) {
14259 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014260 return NULL;
14261 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014262 kind = PyUnicode_KIND(unicode);
14263 length = PyUnicode_GET_LENGTH(unicode);
14264
14265 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014266#ifdef Py_DEBUG
14267 _PyUnicode_HASH(self) = -1;
14268#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014269 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014270#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014271 _PyUnicode_STATE(self).interned = 0;
14272 _PyUnicode_STATE(self).kind = kind;
14273 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014274 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014275 _PyUnicode_STATE(self).ready = 1;
14276 _PyUnicode_WSTR(self) = NULL;
14277 _PyUnicode_UTF8_LENGTH(self) = 0;
14278 _PyUnicode_UTF8(self) = NULL;
14279 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014280 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014281
14282 share_utf8 = 0;
14283 share_wstr = 0;
14284 if (kind == PyUnicode_1BYTE_KIND) {
14285 char_size = 1;
14286 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14287 share_utf8 = 1;
14288 }
14289 else if (kind == PyUnicode_2BYTE_KIND) {
14290 char_size = 2;
14291 if (sizeof(wchar_t) == 2)
14292 share_wstr = 1;
14293 }
14294 else {
14295 assert(kind == PyUnicode_4BYTE_KIND);
14296 char_size = 4;
14297 if (sizeof(wchar_t) == 4)
14298 share_wstr = 1;
14299 }
14300
14301 /* Ensure we won't overflow the length. */
14302 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14303 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014304 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014305 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014306 data = PyObject_MALLOC((length + 1) * char_size);
14307 if (data == NULL) {
14308 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014309 goto onError;
14310 }
14311
Victor Stinnerc3c74152011-10-02 20:39:55 +020014312 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014313 if (share_utf8) {
14314 _PyUnicode_UTF8_LENGTH(self) = length;
14315 _PyUnicode_UTF8(self) = data;
14316 }
14317 if (share_wstr) {
14318 _PyUnicode_WSTR_LENGTH(self) = length;
14319 _PyUnicode_WSTR(self) = (wchar_t *)data;
14320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014321
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014322 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014323 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014324 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014325#ifdef Py_DEBUG
14326 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14327#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014328 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014329 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014330
14331onError:
14332 Py_DECREF(unicode);
14333 Py_DECREF(self);
14334 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014335}
14336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014337PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014338 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014339\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014340Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014341encoding defaults to the current default string encoding.\n\
14342errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014343
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014344static PyObject *unicode_iter(PyObject *seq);
14345
Guido van Rossumd57fd912000-03-10 22:53:23 +000014346PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014347 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014348 "str", /* tp_name */
14349 sizeof(PyUnicodeObject), /* tp_size */
14350 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014351 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 (destructor)unicode_dealloc, /* tp_dealloc */
14353 0, /* tp_print */
14354 0, /* tp_getattr */
14355 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014356 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 unicode_repr, /* tp_repr */
14358 &unicode_as_number, /* tp_as_number */
14359 &unicode_as_sequence, /* tp_as_sequence */
14360 &unicode_as_mapping, /* tp_as_mapping */
14361 (hashfunc) unicode_hash, /* tp_hash*/
14362 0, /* tp_call*/
14363 (reprfunc) unicode_str, /* tp_str */
14364 PyObject_GenericGetAttr, /* tp_getattro */
14365 0, /* tp_setattro */
14366 0, /* tp_as_buffer */
14367 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014368 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 unicode_doc, /* tp_doc */
14370 0, /* tp_traverse */
14371 0, /* tp_clear */
14372 PyUnicode_RichCompare, /* tp_richcompare */
14373 0, /* tp_weaklistoffset */
14374 unicode_iter, /* tp_iter */
14375 0, /* tp_iternext */
14376 unicode_methods, /* tp_methods */
14377 0, /* tp_members */
14378 0, /* tp_getset */
14379 &PyBaseObject_Type, /* tp_base */
14380 0, /* tp_dict */
14381 0, /* tp_descr_get */
14382 0, /* tp_descr_set */
14383 0, /* tp_dictoffset */
14384 0, /* tp_init */
14385 0, /* tp_alloc */
14386 unicode_new, /* tp_new */
14387 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014388};
14389
14390/* Initialize the Unicode implementation */
14391
Victor Stinner3a50e702011-10-18 21:21:00 +020014392int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014393{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014394 int i;
14395
Thomas Wouters477c8d52006-05-27 19:21:47 +000014396 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014397 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014398 0x000A, /* LINE FEED */
14399 0x000D, /* CARRIAGE RETURN */
14400 0x001C, /* FILE SEPARATOR */
14401 0x001D, /* GROUP SEPARATOR */
14402 0x001E, /* RECORD SEPARATOR */
14403 0x0085, /* NEXT LINE */
14404 0x2028, /* LINE SEPARATOR */
14405 0x2029, /* PARAGRAPH SEPARATOR */
14406 };
14407
Fred Drakee4315f52000-05-09 19:53:39 +000014408 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014409 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014410 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014411 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014412 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014413
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014414 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014415 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014416 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014418
14419 /* initialize the linebreak bloom filter */
14420 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014421 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014422 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014423
14424 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014425
14426#ifdef HAVE_MBCS
14427 winver.dwOSVersionInfoSize = sizeof(winver);
14428 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14429 PyErr_SetFromWindowsErr(0);
14430 return -1;
14431 }
14432#endif
14433 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014434}
14435
/* Finalize the Unicode implementation */

/* Free-list clearing hook.  Nothing is released here; the function
   unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14443
Guido van Rossumd57fd912000-03-10 22:53:23 +000014444void
Thomas Wouters78890102000-07-22 19:25:51 +000014445_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014446{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014447 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014448
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014449 Py_XDECREF(unicode_empty);
14450 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014451
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014452 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014453 if (unicode_latin1[i]) {
14454 Py_DECREF(unicode_latin1[i]);
14455 unicode_latin1[i] = NULL;
14456 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014457 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014458 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014459 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014460}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014461
Walter Dörwald16807132007-05-25 13:52:07 +000014462void
14463PyUnicode_InternInPlace(PyObject **p)
14464{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014465 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014467#ifdef Py_DEBUG
14468 assert(s != NULL);
14469 assert(_PyUnicode_CHECK(s));
14470#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014471 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014472 return;
14473#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014474 /* If it's a subclass, we don't really know what putting
14475 it in the interned dict might do. */
14476 if (!PyUnicode_CheckExact(s))
14477 return;
14478 if (PyUnicode_CHECK_INTERNED(s))
14479 return;
14480 if (interned == NULL) {
14481 interned = PyDict_New();
14482 if (interned == NULL) {
14483 PyErr_Clear(); /* Don't leave an exception */
14484 return;
14485 }
14486 }
14487 /* It might be that the GetItem call fails even
14488 though the key is present in the dictionary,
14489 namely when this happens during a stack overflow. */
14490 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014491 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014492 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014493
Benjamin Peterson29060642009-01-31 22:14:21 +000014494 if (t) {
14495 Py_INCREF(t);
14496 Py_DECREF(*p);
14497 *p = t;
14498 return;
14499 }
Walter Dörwald16807132007-05-25 13:52:07 +000014500
Benjamin Peterson14339b62009-01-31 16:36:08 +000014501 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014502 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014503 PyErr_Clear();
14504 PyThreadState_GET()->recursion_critical = 0;
14505 return;
14506 }
14507 PyThreadState_GET()->recursion_critical = 0;
14508 /* The two references in interned are not counted by refcnt.
14509 The deallocator will take care of this */
14510 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014511 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014512}
14513
14514void
14515PyUnicode_InternImmortal(PyObject **p)
14516{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014517 PyUnicode_InternInPlace(p);
14518 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014519 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014520 Py_INCREF(*p);
14521 }
Walter Dörwald16807132007-05-25 13:52:07 +000014522}
14523
14524PyObject *
14525PyUnicode_InternFromString(const char *cp)
14526{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014527 PyObject *s = PyUnicode_FromString(cp);
14528 if (s == NULL)
14529 return NULL;
14530 PyUnicode_InternInPlace(&s);
14531 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014532}
14533
Alexander Belopolsky40018472011-02-26 01:02:56 +000014534void
14535_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014536{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014538 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 Py_ssize_t i, n;
14540 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014541
Benjamin Peterson14339b62009-01-31 16:36:08 +000014542 if (interned == NULL || !PyDict_Check(interned))
14543 return;
14544 keys = PyDict_Keys(interned);
14545 if (keys == NULL || !PyList_Check(keys)) {
14546 PyErr_Clear();
14547 return;
14548 }
Walter Dörwald16807132007-05-25 13:52:07 +000014549
Benjamin Peterson14339b62009-01-31 16:36:08 +000014550 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14551 detector, interned unicode strings are not forcibly deallocated;
14552 rather, we give them their stolen references back, and then clear
14553 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014554
Benjamin Peterson14339b62009-01-31 16:36:08 +000014555 n = PyList_GET_SIZE(keys);
14556 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014557 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014558 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014559 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014560 if (PyUnicode_READY(s) == -1) {
14561 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014562 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014564 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014565 case SSTATE_NOT_INTERNED:
14566 /* XXX Shouldn't happen */
14567 break;
14568 case SSTATE_INTERNED_IMMORTAL:
14569 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014570 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014571 break;
14572 case SSTATE_INTERNED_MORTAL:
14573 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014574 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014575 break;
14576 default:
14577 Py_FatalError("Inconsistent interned string state.");
14578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014579 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014580 }
14581 fprintf(stderr, "total size of all interned strings: "
14582 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14583 "mortal/immortal\n", mortal_size, immortal_size);
14584 Py_DECREF(keys);
14585 PyDict_Clear(interned);
14586 Py_DECREF(interned);
14587 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014588}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014589
14590
14591/********************* Unicode Iterator **************************/
14592
14593typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014594 PyObject_HEAD
14595 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014596 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014597} unicodeiterobject;
14598
14599static void
14600unicodeiter_dealloc(unicodeiterobject *it)
14601{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014602 _PyObject_GC_UNTRACK(it);
14603 Py_XDECREF(it->it_seq);
14604 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014605}
14606
14607static int
14608unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14609{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014610 Py_VISIT(it->it_seq);
14611 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014612}
14613
14614static PyObject *
14615unicodeiter_next(unicodeiterobject *it)
14616{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014617 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014618
Benjamin Peterson14339b62009-01-31 16:36:08 +000014619 assert(it != NULL);
14620 seq = it->it_seq;
14621 if (seq == NULL)
14622 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014623 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014625 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14626 int kind = PyUnicode_KIND(seq);
14627 void *data = PyUnicode_DATA(seq);
14628 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14629 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014630 if (item != NULL)
14631 ++it->it_index;
14632 return item;
14633 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014634
Benjamin Peterson14339b62009-01-31 16:36:08 +000014635 Py_DECREF(seq);
14636 it->it_seq = NULL;
14637 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014638}
14639
14640static PyObject *
14641unicodeiter_len(unicodeiterobject *it)
14642{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014643 Py_ssize_t len = 0;
14644 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014645 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014646 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014647}
14648
14649PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14650
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014651static PyObject *
14652unicodeiter_reduce(unicodeiterobject *it)
14653{
14654 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014655 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014656 it->it_seq, it->it_index);
14657 } else {
14658 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14659 if (u == NULL)
14660 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014661 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014662 }
14663}
14664
14665PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14666
14667static PyObject *
14668unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14669{
14670 Py_ssize_t index = PyLong_AsSsize_t(state);
14671 if (index == -1 && PyErr_Occurred())
14672 return NULL;
14673 if (index < 0)
14674 index = 0;
14675 it->it_index = index;
14676 Py_RETURN_NONE;
14677}
14678
14679PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14680
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014681static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014682 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014683 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014684 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14685 reduce_doc},
14686 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14687 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014688 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014689};
14690
14691PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014692 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14693 "str_iterator", /* tp_name */
14694 sizeof(unicodeiterobject), /* tp_basicsize */
14695 0, /* tp_itemsize */
14696 /* methods */
14697 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14698 0, /* tp_print */
14699 0, /* tp_getattr */
14700 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014701 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014702 0, /* tp_repr */
14703 0, /* tp_as_number */
14704 0, /* tp_as_sequence */
14705 0, /* tp_as_mapping */
14706 0, /* tp_hash */
14707 0, /* tp_call */
14708 0, /* tp_str */
14709 PyObject_GenericGetAttr, /* tp_getattro */
14710 0, /* tp_setattro */
14711 0, /* tp_as_buffer */
14712 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14713 0, /* tp_doc */
14714 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14715 0, /* tp_clear */
14716 0, /* tp_richcompare */
14717 0, /* tp_weaklistoffset */
14718 PyObject_SelfIter, /* tp_iter */
14719 (iternextfunc)unicodeiter_next, /* tp_iternext */
14720 unicodeiter_methods, /* tp_methods */
14721 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014722};
14723
14724static PyObject *
14725unicode_iter(PyObject *seq)
14726{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014727 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014728
Benjamin Peterson14339b62009-01-31 16:36:08 +000014729 if (!PyUnicode_Check(seq)) {
14730 PyErr_BadInternalCall();
14731 return NULL;
14732 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014733 if (PyUnicode_READY(seq) == -1)
14734 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014735 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14736 if (it == NULL)
14737 return NULL;
14738 it->it_index = 0;
14739 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014740 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014741 _PyObject_GC_TRACK(it);
14742 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014743}
14744
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014745
14746size_t
14747Py_UNICODE_strlen(const Py_UNICODE *u)
14748{
14749 int res = 0;
14750 while(*u++)
14751 res++;
14752 return res;
14753}
14754
14755Py_UNICODE*
14756Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14757{
14758 Py_UNICODE *u = s1;
14759 while ((*u++ = *s2++));
14760 return s1;
14761}
14762
14763Py_UNICODE*
14764Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14765{
14766 Py_UNICODE *u = s1;
14767 while ((*u++ = *s2++))
14768 if (n-- == 0)
14769 break;
14770 return s1;
14771}
14772
14773Py_UNICODE*
14774Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14775{
14776 Py_UNICODE *u1 = s1;
14777 u1 += Py_UNICODE_strlen(u1);
14778 Py_UNICODE_strcpy(u1, s2);
14779 return s1;
14780}
14781
14782int
14783Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14784{
14785 while (*s1 && *s2 && *s1 == *s2)
14786 s1++, s2++;
14787 if (*s1 && *s2)
14788 return (*s1 < *s2) ? -1 : +1;
14789 if (*s1)
14790 return 1;
14791 if (*s2)
14792 return -1;
14793 return 0;
14794}
14795
14796int
14797Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14798{
14799 register Py_UNICODE u1, u2;
14800 for (; n != 0; n--) {
14801 u1 = *s1;
14802 u2 = *s2;
14803 if (u1 != u2)
14804 return (u1 < u2) ? -1 : +1;
14805 if (u1 == '\0')
14806 return 0;
14807 s1++;
14808 s2++;
14809 }
14810 return 0;
14811}
14812
14813Py_UNICODE*
14814Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14815{
14816 const Py_UNICODE *p;
14817 for (p = s; *p; p++)
14818 if (*p == c)
14819 return (Py_UNICODE*)p;
14820 return NULL;
14821}
14822
14823Py_UNICODE*
14824Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14825{
14826 const Py_UNICODE *p;
14827 p = s + Py_UNICODE_strlen(s);
14828 while (p != s) {
14829 p--;
14830 if (*p == c)
14831 return (Py_UNICODE*)p;
14832 }
14833 return NULL;
14834}
Victor Stinner331ea922010-08-10 16:37:20 +000014835
Victor Stinner71133ff2010-09-01 23:43:53 +000014836Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014837PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014838{
Victor Stinner577db2c2011-10-11 22:12:48 +020014839 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014840 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014842 if (!PyUnicode_Check(unicode)) {
14843 PyErr_BadArgument();
14844 return NULL;
14845 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014846 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014847 if (u == NULL)
14848 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014849 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014850 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014851 PyErr_NoMemory();
14852 return NULL;
14853 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014854 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014855 size *= sizeof(Py_UNICODE);
14856 copy = PyMem_Malloc(size);
14857 if (copy == NULL) {
14858 PyErr_NoMemory();
14859 return NULL;
14860 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014861 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014862 return copy;
14863}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014864
Georg Brandl66c221e2010-10-14 07:04:07 +000014865/* A _string module, to export formatter_parser and formatter_field_name_split
14866 to the string.Formatter class implemented in Python. */
14867
14868static PyMethodDef _string_methods[] = {
14869 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14870 METH_O, PyDoc_STR("split the argument as a field name")},
14871 {"formatter_parser", (PyCFunction) formatter_parser,
14872 METH_O, PyDoc_STR("parse the argument as a format string")},
14873 {NULL, NULL}
14874};
14875
14876static struct PyModuleDef _string_module = {
14877 PyModuleDef_HEAD_INIT,
14878 "_string",
14879 PyDoc_STR("string helper module"),
14880 0,
14881 _string_methods,
14882 NULL,
14883 NULL,
14884 NULL,
14885 NULL
14886};
14887
14888PyMODINIT_FUNC
14889PyInit__string(void)
14890{
14891 return PyModule_Create(&_string_module);
14892}
14893
14894
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014895#ifdef __cplusplus
14896}
14897#endif