blob: d816a465498af663c30652bb31410c131b560df5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
Victor Stinnere90fe6a2011-10-01 16:48:13 +020079#define _PyUnicode_UTF8(op) \
80 (((PyCompactUnicodeObject*)(op))->utf8)
81#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020082 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020083 assert(PyUnicode_IS_READY(op)), \
84 PyUnicode_IS_COMPACT_ASCII(op) ? \
85 ((char*)((PyASCIIObject*)(op) + 1)) : \
86 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020087#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020088 (((PyCompactUnicodeObject*)(op))->utf8_length)
89#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020090 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020091 assert(PyUnicode_IS_READY(op)), \
92 PyUnicode_IS_COMPACT_ASCII(op) ? \
93 ((PyASCIIObject*)(op))->length : \
94 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020095#define _PyUnicode_WSTR(op) \
96 (((PyASCIIObject*)(op))->wstr)
97#define _PyUnicode_WSTR_LENGTH(op) \
98 (((PyCompactUnicodeObject*)(op))->wstr_length)
99#define _PyUnicode_LENGTH(op) \
100 (((PyASCIIObject *)(op))->length)
101#define _PyUnicode_STATE(op) \
102 (((PyASCIIObject *)(op))->state)
103#define _PyUnicode_HASH(op) \
104 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200105#define _PyUnicode_KIND(op) \
106 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200107 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200108#define _PyUnicode_GET_LENGTH(op) \
109 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200110 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200111#define _PyUnicode_DATA_ANY(op) \
112 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Maximum of two "maximum character" values: use it when you are computing
   the second argument of PyUnicode_New().

   This is a true maximum and not a bitwise OR: OR-ing two valid code
   points can produce a value above MAX_UNICODE
   (0x100000 | 0xFFFFF == 0x1FFFFF).  The selected argument is evaluated
   twice, so the arguments must not have side effects. */
#define MAX_MAXCHAR(maxchar1, maxchar2) \
    ((maxchar1) > (maxchar2) ? (maxchar1) : (maxchar2))
118
Victor Stinner910337b2011-10-03 03:20:16 +0200119#undef PyUnicode_READY
120#define PyUnicode_READY(op) \
121 (assert(_PyUnicode_CHECK(op)), \
122 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200123 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100124 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
Victor Stinnerc379ead2011-10-03 12:52:27 +0200126#define _PyUnicode_SHARE_UTF8(op) \
127 (assert(_PyUnicode_CHECK(op)), \
128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the same pointer as
   its character data (PyUnicode_DATA).  Only the macro argument is
   referenced, so the macro does not depend on the caller having a
   variable with a particular name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
Victor Stinner829c0ad2011-10-03 01:08:02 +0200134/* true if the Unicode object has an allocated UTF-8 memory block
135 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200136#define _PyUnicode_HAS_UTF8_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \
138 (!PyUnicode_IS_COMPACT_ASCII(op) \
139 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
141
Victor Stinner03490912011-10-03 23:45:12 +0200142/* true if the Unicode object has an allocated wstr memory block
143 (not shared with other data) */
144#define _PyUnicode_HAS_WSTR_MEMORY(op) \
145 (assert(_PyUnicode_CHECK(op)), \
146 (_PyUnicode_WSTR(op) && \
147 (!PyUnicode_IS_READY(op) || \
148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *". to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The main loop converts four characters per iteration; the remaining
   0 to 3 characters are converted one at a time.  Each of begin, end and
   to is evaluated exactly once, and to is parenthesized so that the cast
   applies to the whole argument expression. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: one flag per
   code point in the range 0x00..0x7F, set to 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
/* Same for linebreaks: one flag per code point in the range 0x00..0x7F,
   set to 1 for line break characters.  The table is only ever read, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
Victor Stinner910337b2011-10-03 03:20:16 +0200299#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200300int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100301_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200302{
303 PyASCIIObject *ascii;
304 unsigned int kind;
305
306 assert(PyUnicode_Check(op));
307
308 ascii = (PyASCIIObject *)op;
309 kind = ascii->state.kind;
310
Victor Stinnera3b334d2011-10-03 13:53:37 +0200311 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner910337b2011-10-03 03:20:16 +0200312 assert(kind == PyUnicode_1BYTE_KIND);
Victor Stinner910337b2011-10-03 03:20:16 +0200313 assert(ascii->state.ready == 1);
314 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200315 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200316 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200317 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200318
Victor Stinnera41463c2011-10-04 01:05:08 +0200319 if (ascii->state.compact == 1) {
320 data = compact + 1;
Victor Stinner910337b2011-10-03 03:20:16 +0200321 assert(kind == PyUnicode_1BYTE_KIND
322 || kind == PyUnicode_2BYTE_KIND
323 || kind == PyUnicode_4BYTE_KIND);
Victor Stinnera41463c2011-10-04 01:05:08 +0200324 assert(ascii->state.ascii == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200325 assert(ascii->state.ready == 1);
Victor Stinnera41463c2011-10-04 01:05:08 +0200326 assert (compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100327 }
328 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200329 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
330
331 data = unicode->data.any;
332 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinnere30c0a12011-11-04 20:54:05 +0100333 assert(ascii->length == 0);
334 assert(ascii->hash == -1);
Victor Stinnera41463c2011-10-04 01:05:08 +0200335 assert(ascii->state.compact == 0);
336 assert(ascii->state.ascii == 0);
337 assert(ascii->state.ready == 0);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100338 assert(ascii->state.interned == SSTATE_NOT_INTERNED);
Victor Stinnera41463c2011-10-04 01:05:08 +0200339 assert(ascii->wstr != NULL);
340 assert(data == NULL);
341 assert(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200342 }
343 else {
344 assert(kind == PyUnicode_1BYTE_KIND
345 || kind == PyUnicode_2BYTE_KIND
346 || kind == PyUnicode_4BYTE_KIND);
347 assert(ascii->state.compact == 0);
348 assert(ascii->state.ready == 1);
349 assert(data != NULL);
350 if (ascii->state.ascii) {
351 assert (compact->utf8 == data);
352 assert (compact->utf8_length == ascii->length);
353 }
354 else
355 assert (compact->utf8 != data);
356 }
357 }
358 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200359 if (
360#if SIZEOF_WCHAR_T == 2
361 kind == PyUnicode_2BYTE_KIND
362#else
363 kind == PyUnicode_4BYTE_KIND
364#endif
365 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200366 {
367 assert(ascii->wstr == data);
368 assert(compact->wstr_length == ascii->length);
369 } else
370 assert(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200371 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200372
373 if (compact->utf8 == NULL)
374 assert(compact->utf8_length == 0);
375 if (ascii->wstr == NULL)
376 assert(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200377 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200378 /* check that the best kind is used */
379 if (check_content && kind != PyUnicode_WCHAR_KIND)
380 {
381 Py_ssize_t i;
382 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200383 void *data;
384 Py_UCS4 ch;
385
386 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200387 for (i=0; i < ascii->length; i++)
388 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200389 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200390 if (ch > maxchar)
391 maxchar = ch;
392 }
393 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100394 if (ascii->state.ascii == 0) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200395 assert(maxchar >= 128);
Victor Stinner77faf692011-11-20 18:56:05 +0100396 assert(maxchar <= 255);
397 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200398 else
399 assert(maxchar < 128);
400 }
Victor Stinner77faf692011-11-20 18:56:05 +0100401 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200402 assert(maxchar >= 0x100);
Victor Stinner77faf692011-11-20 18:56:05 +0100403 assert(maxchar <= 0xFFFF);
404 }
405 else {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200406 assert(maxchar >= 0x10000);
Victor Stinner8faf8212011-12-08 22:14:11 +0100407 assert(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100408 }
Victor Stinner718fbf02012-04-26 00:39:37 +0200409 assert(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200410 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400411 return 1;
412}
Victor Stinner910337b2011-10-03 03:20:16 +0200413#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
Victor Stinner3a50e702011-10-18 21:21:00 +0200513#ifdef HAVE_MBCS
514static OSVERSIONINFOEX winver;
515#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each unicode character (the character masked with BLOOM_WIDTH - 1)
   as the bit index. */
522
/* the linebreak mask is set up by _PyUnicode_Init() below */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535#define BLOOM_MASK unsigned long
536
537static BLOOM_MASK bloom_linebreak;
538
Antoine Pitrouf068f942010-01-13 14:19:12 +0000539#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
540#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Benjamin Peterson29060642009-01-31 22:14:21 +0000542#define BLOOM_LINEBREAK(ch) \
543 ((ch) < 128U ? ascii_linebreak[(ch)] : \
544 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561#define BLOOM_MEMBER(mask, chr, str) \
562 (BLOOM(mask, chr) \
563 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200657 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100659 assert(PyUnicode_IS_COMPACT(unicode));
660
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200661 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100662 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 struct_size = sizeof(PyASCIIObject);
664 else
665 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200666 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
669 PyErr_NoMemory();
670 return NULL;
671 }
672 new_size = (struct_size + (length + 1) * char_size);
673
Victor Stinner84def372011-12-11 20:04:56 +0100674 _Py_DEC_REFTOTAL;
675 _Py_ForgetReference(unicode);
676
677 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
678 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100679 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 PyErr_NoMemory();
681 return NULL;
682 }
Victor Stinner84def372011-12-11 20:04:56 +0100683 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100685
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200687 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100689 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200690 _PyUnicode_WSTR_LENGTH(unicode) = length;
691 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
693 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200694 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return unicode;
696}
697
Alexander Belopolsky40018472011-02-26 01:02:56 +0000698static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200699resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700{
Victor Stinner95663112011-10-04 01:03:50 +0200701 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100702 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000705
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 if (PyUnicode_IS_READY(unicode)) {
707 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200708 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 void *data;
710
711 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200713 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
714 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
716 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
717 PyErr_NoMemory();
718 return -1;
719 }
720 new_size = (length + 1) * char_size;
721
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
723 {
724 PyObject_DEL(_PyUnicode_UTF8(unicode));
725 _PyUnicode_UTF8(unicode) = NULL;
726 _PyUnicode_UTF8_LENGTH(unicode) = 0;
727 }
728
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 data = (PyObject *)PyObject_REALLOC(data, new_size);
730 if (data == NULL) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200737 _PyUnicode_WSTR_LENGTH(unicode) = length;
738 }
739 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200740 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_UTF8_LENGTH(unicode) = length;
742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_LENGTH(unicode) = length;
744 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200745 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200746 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749 }
Victor Stinner95663112011-10-04 01:03:50 +0200750 assert(_PyUnicode_WSTR(unicode) != NULL);
751
752 /* check for integer overflow */
753 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
754 PyErr_NoMemory();
755 return -1;
756 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200758 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100759 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200760 if (!wstr) {
761 PyErr_NoMemory();
762 return -1;
763 }
764 _PyUnicode_WSTR(unicode) = wstr;
765 _PyUnicode_WSTR(unicode)[length] = 0;
766 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200767 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 return 0;
769}
770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771static PyObject*
772resize_copy(PyObject *unicode, Py_ssize_t length)
773{
774 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777
Benjamin Petersonbac79492012-01-14 13:34:47 -0500778 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
782 if (copy == NULL)
783 return NULL;
784
785 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200786 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200788 }
789 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 if (w == NULL)
794 return NULL;
795 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
796 copy_length = Py_MIN(copy_length, length);
797 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
798 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200799 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200800 }
801}
802
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200813static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814#endif
815
Alexander Belopolsky40018472011-02-26 01:02:56 +0000816static PyUnicodeObject *
817_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818{
819 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 if (length == 0 && unicode_empty != NULL) {
824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200825 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000828 /* Ensure we won't overflow the size. */
829 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
830 return (PyUnicodeObject *)PyErr_NoMemory();
831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832 if (length < 0) {
833 PyErr_SetString(PyExc_SystemError,
834 "Negative size passed to _PyUnicode_New");
835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 }
837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838#ifdef Py_DEBUG
839 ++unicode_old_new_calls;
840#endif
841
842 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
843 if (unicode == NULL)
844 return NULL;
845 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
846 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
847 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000849 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100850 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852
Jeremy Hyltond8082792003-09-16 19:41:39 +0000853 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000854 * the caller fails before initializing str -- unicode_resize()
855 * reads str[0], and the Keep-Alive optimization can keep memory
856 * allocated for str alive across a call to unicode_dealloc(unicode).
857 * We don't want unicode_resize to read uninitialized memory in
858 * that case.
859 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 _PyUnicode_WSTR(unicode)[0] = 0;
861 _PyUnicode_WSTR(unicode)[length] = 0;
862 _PyUnicode_WSTR_LENGTH(unicode) = length;
863 _PyUnicode_HASH(unicode) = -1;
864 _PyUnicode_STATE(unicode).interned = 0;
865 _PyUnicode_STATE(unicode).kind = 0;
866 _PyUnicode_STATE(unicode).compact = 0;
867 _PyUnicode_STATE(unicode).ready = 0;
868 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200869 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200871 _PyUnicode_UTF8(unicode) = NULL;
872 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100873 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874 return unicode;
875}
876
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877static const char*
878unicode_kind_name(PyObject *unicode)
879{
Victor Stinner42dfd712011-10-03 14:41:45 +0200880 /* don't check consistency: unicode_kind_name() is called from
881 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 if (!PyUnicode_IS_COMPACT(unicode))
883 {
884 if (!PyUnicode_IS_READY(unicode))
885 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600886 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 {
888 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 return "legacy ascii";
891 else
892 return "legacy latin1";
893 case PyUnicode_2BYTE_KIND:
894 return "legacy UCS2";
895 case PyUnicode_4BYTE_KIND:
896 return "legacy UCS4";
897 default:
898 return "<legacy invalid kind>";
899 }
900 }
901 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600902 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 return "ascii";
906 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 default:
913 return "<invalid compact kind>";
914 }
915}
916
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New() for every call that
   does not return the shared empty string. */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print layout details of `unicode` to stdout, then return the result of
   the PyUnicode_DATA() macro. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string object `op` to stdout: its kind
   name, length, wstr pointer, and (unless it is a compact ASCII string) the
   wstr length and the UTF-8 pointer and length, followed by the data pointer.
   Pointers that alias the data buffer are flagged as "shared". */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    /* Locate the character data without going through the checking macros. */
    if (ascii->state.compact)
    {
        if (ascii->state.ascii)
            data = (ascii + 1);
        else
            data = (compact + 1);
    }
    else
        data = unicode->data.any;

    /* Lengths are Py_ssize_t (signed): print them with %zd, not %zu. */
    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    if ((void *)ascii->wstr == data)
        printf("shared ");
    /* %p requires a void* argument. */
    printf("wstr=%p", (void *)ascii->wstr);

    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
970
971PyObject *
972PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
973{
974 PyObject *obj;
975 PyCompactUnicodeObject *unicode;
976 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200977 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200978 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 Py_ssize_t char_size;
980 Py_ssize_t struct_size;
981
982 /* Optimization for empty strings */
983 if (size == 0 && unicode_empty != NULL) {
984 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200985 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 }
987
988#ifdef Py_DEBUG
989 ++unicode_new_new_calls;
990#endif
991
Victor Stinner9e9d6892011-10-04 01:02:02 +0200992 is_ascii = 0;
993 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 struct_size = sizeof(PyCompactUnicodeObject);
995 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200996 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 char_size = 1;
998 is_ascii = 1;
999 struct_size = sizeof(PyASCIIObject);
1000 }
1001 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001002 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 char_size = 1;
1004 }
1005 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 2;
1008 if (sizeof(wchar_t) == 2)
1009 is_sharing = 1;
1010 }
1011 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001012 if (maxchar > MAX_UNICODE) {
1013 PyErr_SetString(PyExc_SystemError,
1014 "invalid maximum character passed to PyUnicode_New");
1015 return NULL;
1016 }
Victor Stinner8f825062012-04-27 13:55:39 +02001017 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 char_size = 4;
1019 if (sizeof(wchar_t) == 4)
1020 is_sharing = 1;
1021 }
1022
1023 /* Ensure we won't overflow the size. */
1024 if (size < 0) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "Negative size passed to PyUnicode_New");
1027 return NULL;
1028 }
1029 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1030 return PyErr_NoMemory();
1031
1032 /* Duplicated allocation code from _PyObject_New() instead of a call to
1033 * PyObject_New() so we are able to allocate space for the object and
1034 * it's data buffer.
1035 */
1036 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1037 if (obj == NULL)
1038 return PyErr_NoMemory();
1039 obj = PyObject_INIT(obj, &PyUnicode_Type);
1040 if (obj == NULL)
1041 return NULL;
1042
1043 unicode = (PyCompactUnicodeObject *)obj;
1044 if (is_ascii)
1045 data = ((PyASCIIObject*)obj) + 1;
1046 else
1047 data = unicode + 1;
1048 _PyUnicode_LENGTH(unicode) = size;
1049 _PyUnicode_HASH(unicode) = -1;
1050 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 _PyUnicode_STATE(unicode).compact = 1;
1053 _PyUnicode_STATE(unicode).ready = 1;
1054 _PyUnicode_STATE(unicode).ascii = is_ascii;
1055 if (is_ascii) {
1056 ((char*)data)[size] = 0;
1057 _PyUnicode_WSTR(unicode) = NULL;
1058 }
Victor Stinner8f825062012-04-27 13:55:39 +02001059 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001064 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 else {
1067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((Py_UCS4*)data)[size] = 0;
1073 if (is_sharing) {
1074 _PyUnicode_WSTR_LENGTH(unicode) = size;
1075 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1076 }
1077 else {
1078 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1079 _PyUnicode_WSTR(unicode) = NULL;
1080 }
1081 }
Victor Stinner8f825062012-04-27 13:55:39 +02001082#ifdef Py_DEBUG
1083 /* Fill the data with invalid characters to detect bugs earlier.
1084 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1085 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1086 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1087 memset(data, 0xff, size * kind);
1088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   combined into one code point; every other unit (lone surrogates included)
   is copied through unchanged.  Conversions between the other widths are
   implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number of
   code points produced, with room for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the destination's declared length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* the destination must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001153 assert(PyUnicode_Check(from));
1154 assert(PyUnicode_Check(to));
1155 assert(PyUnicode_IS_READY(from));
1156 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001158 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1159 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1160 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001162 if (how_many == 0)
1163 return 0;
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001168 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001170#ifdef Py_DEBUG
1171 if (!check_maxchar
1172 && (from_kind > to_kind
1173 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001174 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001175 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1176 Py_UCS4 ch;
1177 Py_ssize_t i;
1178 for (i=0; i < how_many; i++) {
1179 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1180 assert(ch <= to_maxchar);
1181 }
1182 }
1183#endif
1184 fast = (from_kind == to_kind);
1185 if (check_maxchar
1186 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1187 {
1188 /* deny latin1 => ascii */
1189 fast = 0;
1190 }
1191
1192 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001193 Py_MEMCPY((char*)to_data + to_kind * to_start,
1194 (char*)from_data + from_kind * from_start,
1195 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001197 else if (from_kind == PyUnicode_1BYTE_KIND
1198 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001199 {
1200 _PyUnicode_CONVERT_BYTES(
1201 Py_UCS1, Py_UCS2,
1202 PyUnicode_1BYTE_DATA(from) + from_start,
1203 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1204 PyUnicode_2BYTE_DATA(to) + to_start
1205 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001207 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001208 && to_kind == PyUnicode_4BYTE_KIND)
1209 {
1210 _PyUnicode_CONVERT_BYTES(
1211 Py_UCS1, Py_UCS4,
1212 PyUnicode_1BYTE_DATA(from) + from_start,
1213 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1214 PyUnicode_4BYTE_DATA(to) + to_start
1215 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001216 }
1217 else if (from_kind == PyUnicode_2BYTE_KIND
1218 && to_kind == PyUnicode_4BYTE_KIND)
1219 {
1220 _PyUnicode_CONVERT_BYTES(
1221 Py_UCS2, Py_UCS4,
1222 PyUnicode_2BYTE_DATA(from) + from_start,
1223 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1224 PyUnicode_4BYTE_DATA(to) + to_start
1225 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001226 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001227 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001228 /* check if max_char(from substring) <= max_char(to) */
1229 if (from_kind > to_kind
1230 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001231 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001232 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001233 /* slow path to check for character overflow */
1234 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001235 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001236 Py_ssize_t i;
1237
Victor Stinner56c161a2011-10-06 02:47:11 +02001238#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001239 for (i=0; i < how_many; i++) {
1240 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001241 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001242 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1243 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001244#else
1245 if (!check_maxchar) {
1246 for (i=0; i < how_many; i++) {
1247 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1248 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1249 }
1250 }
1251 else {
1252 for (i=0; i < how_many; i++) {
1253 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1254 if (ch > to_maxchar)
1255 return 1;
1256 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1257 }
1258 }
1259#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001260 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001261 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001262 assert(0 && "inconsistent state");
1263 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001264 }
1265 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001266 return 0;
1267}
1268
1269static void
1270copy_characters(PyObject *to, Py_ssize_t to_start,
1271 PyObject *from, Py_ssize_t from_start,
1272 Py_ssize_t how_many)
1273{
1274 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1275}
1276
1277Py_ssize_t
1278PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1279 PyObject *from, Py_ssize_t from_start,
1280 Py_ssize_t how_many)
1281{
1282 int err;
1283
1284 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1285 PyErr_BadInternalCall();
1286 return -1;
1287 }
1288
Benjamin Petersonbac79492012-01-14 13:34:47 -05001289 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001291 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001292 return -1;
1293
1294 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1295 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1296 PyErr_Format(PyExc_SystemError,
1297 "Cannot write %zi characters at %zi "
1298 "in a string of %zi characters",
1299 how_many, to_start, PyUnicode_GET_LENGTH(to));
1300 return -1;
1301 }
1302
1303 if (how_many == 0)
1304 return 0;
1305
Victor Stinner488fa492011-12-12 00:01:39 +01001306 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001307 return -1;
1308
1309 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1310 if (err) {
1311 PyErr_Format(PyExc_SystemError,
1312 "Cannot copy %s characters "
1313 "into a string of %s characters",
1314 unicode_kind_name(from),
1315 unicode_kind_name(to));
1316 return -1;
1317 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001318 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319}
1320
Victor Stinner17222162011-09-28 22:15:37 +02001321/* Find the maximum code point and count the number of surrogate pairs so a
1322 correct string length can be computed before converting a string to UCS4.
1323 This function counts single surrogates as a character and not as a pair.
1324
1325 Return 0 on success, or -1 on error. */
1326static int
1327find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1328 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329{
1330 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001331 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332
Victor Stinnerc53be962011-10-02 21:33:54 +02001333 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 *num_surrogates = 0;
1335 *maxchar = 0;
1336
1337 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001338#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001339 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1340 && (iter+1) < end
1341 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001343 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 iter += 2;
1346 }
1347 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001348#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001349 {
1350 ch = *iter;
1351 iter++;
1352 }
1353 if (ch > *maxchar) {
1354 *maxchar = ch;
1355 if (*maxchar > MAX_UNICODE) {
1356 PyErr_Format(PyExc_ValueError,
1357 "character U+%x is not in range [U+0000; U+10ffff]",
1358 ch);
1359 return -1;
1360 }
1361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 }
1363 return 0;
1364}
1365
1366#ifdef Py_DEBUG
/* Debug-build counter, incremented on each call to _PyUnicode_Ready(). */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001367static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368#endif
1369
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001370int
1371_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372{
1373 wchar_t *end;
1374 Py_UCS4 maxchar = 0;
1375 Py_ssize_t num_surrogates;
1376#if SIZEOF_WCHAR_T == 2
1377 Py_ssize_t length_wo_surrogates;
1378#endif
1379
Georg Brandl7597add2011-10-05 16:36:47 +02001380 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001381 strings were created using _PyObject_New() and where no canonical
1382 representation (the str field) has been set yet aka strings
1383 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001384 assert(_PyUnicode_CHECK(unicode));
1385 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001387 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001389 /* Actually, it should neither be interned nor be anything else: */
1390 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391
1392#ifdef Py_DEBUG
1393 ++unicode_ready_calls;
1394#endif
1395
1396 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001397 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001398 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400
1401 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001402 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1403 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyErr_NoMemory();
1405 return -1;
1406 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001407 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 _PyUnicode_WSTR(unicode), end,
1409 PyUnicode_1BYTE_DATA(unicode));
1410 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1411 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1412 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1413 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001414 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 }
1418 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001419 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001420 _PyUnicode_UTF8(unicode) = NULL;
1421 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 PyObject_FREE(_PyUnicode_WSTR(unicode));
1424 _PyUnicode_WSTR(unicode) = NULL;
1425 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1426 }
1427 /* In this case we might have to convert down from 4-byte native
1428 wchar_t to 2-byte unicode. */
1429 else if (maxchar < 65536) {
1430 assert(num_surrogates == 0 &&
1431 "FindMaxCharAndNumSurrogatePairs() messed up");
1432
Victor Stinner506f5922011-09-28 22:34:18 +02001433#if SIZEOF_WCHAR_T == 2
1434 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001435 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001436 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1437 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1438 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001439 _PyUnicode_UTF8(unicode) = NULL;
1440 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001441#else
1442 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001443 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001444 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001445 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001446 PyErr_NoMemory();
1447 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 }
Victor Stinner506f5922011-09-28 22:34:18 +02001449 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1450 _PyUnicode_WSTR(unicode), end,
1451 PyUnicode_2BYTE_DATA(unicode));
1452 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1453 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1454 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 _PyUnicode_UTF8(unicode) = NULL;
1456 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001457 PyObject_FREE(_PyUnicode_WSTR(unicode));
1458 _PyUnicode_WSTR(unicode) = NULL;
1459 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1460#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 }
1462 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1463 else {
1464#if SIZEOF_WCHAR_T == 2
1465 /* in case the native representation is 2-bytes, we need to allocate a
1466 new normalized 4-byte version. */
1467 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1469 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 PyErr_NoMemory();
1471 return -1;
1472 }
1473 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1474 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001475 _PyUnicode_UTF8(unicode) = NULL;
1476 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001477 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1478 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001479 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 PyObject_FREE(_PyUnicode_WSTR(unicode));
1481 _PyUnicode_WSTR(unicode) = NULL;
1482 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1483#else
1484 assert(num_surrogates == 0);
1485
Victor Stinnerc3c74152011-10-02 20:39:55 +02001486 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1491#endif
1492 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1493 }
1494 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001495 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 return 0;
1497}
1498
Alexander Belopolsky40018472011-02-26 01:02:56 +00001499static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001500unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501{
Walter Dörwald16807132007-05-25 13:52:07 +00001502 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 case SSTATE_NOT_INTERNED:
1504 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 case SSTATE_INTERNED_MORTAL:
1507 /* revive dead object temporarily for DelItem */
1508 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001509 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001510 Py_FatalError(
1511 "deletion of interned string failed");
1512 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001513
Benjamin Peterson29060642009-01-31 22:14:21 +00001514 case SSTATE_INTERNED_IMMORTAL:
1515 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001516
Benjamin Peterson29060642009-01-31 22:14:21 +00001517 default:
1518 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001519 }
1520
Victor Stinner03490912011-10-03 23:45:12 +02001521 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001523 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001524 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001525 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1526 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001527
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001528 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529}
1530
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character latin1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is no longer in the wchar_t-only
       state can be an entry of the unicode_latin1 table. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1547
Alexander Belopolsky40018472011-02-26 01:02:56 +00001548static int
Victor Stinner488fa492011-12-12 00:01:39 +01001549unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001550{
Victor Stinner488fa492011-12-12 00:01:39 +01001551 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001552 if (Py_REFCNT(unicode) != 1)
1553 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001554 if (_PyUnicode_HASH(unicode) != -1)
1555 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001556 if (PyUnicode_CHECK_INTERNED(unicode))
1557 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001558 if (!PyUnicode_CheckExact(unicode))
1559 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001560#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001561 /* singleton refcount is greater than 1 */
1562 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001563#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001564 return 1;
1565}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001566
Victor Stinnerfe226c02011-10-03 03:52:20 +02001567static int
1568unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1569{
1570 PyObject *unicode;
1571 Py_ssize_t old_length;
1572
1573 assert(p_unicode != NULL);
1574 unicode = *p_unicode;
1575
1576 assert(unicode != NULL);
1577 assert(PyUnicode_Check(unicode));
1578 assert(0 <= length);
1579
Victor Stinner910337b2011-10-03 03:20:16 +02001580 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 old_length = PyUnicode_WSTR_LENGTH(unicode);
1582 else
1583 old_length = PyUnicode_GET_LENGTH(unicode);
1584 if (old_length == length)
1585 return 0;
1586
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001587 if (length == 0) {
1588 Py_DECREF(*p_unicode);
1589 *p_unicode = unicode_empty;
1590 Py_INCREF(*p_unicode);
1591 return 0;
1592 }
1593
Victor Stinner488fa492011-12-12 00:01:39 +01001594 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 PyObject *copy = resize_copy(unicode, length);
1596 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001597 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001598 Py_DECREF(*p_unicode);
1599 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001600 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001601 }
1602
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 PyObject *new_unicode = resize_compact(unicode, length);
1605 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001607 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001608 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001609 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001610 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001611}
1612
Alexander Belopolsky40018472011-02-26 01:02:56 +00001613int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001614PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001615{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 PyObject *unicode;
1617 if (p_unicode == NULL) {
1618 PyErr_BadInternalCall();
1619 return -1;
1620 }
1621 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001622 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 {
1624 PyErr_BadInternalCall();
1625 return -1;
1626 }
1627 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001628}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001630static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001631unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1632 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001633{
1634 PyObject *result;
1635 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001636 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001637 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1638 return 0;
1639 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1640 maxchar);
1641 if (result == NULL)
1642 return -1;
Victor Stinner1b487b42012-05-03 12:29:04 +02001643 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001644 Py_DECREF(*p_unicode);
1645 *p_unicode = result;
1646 return 0;
1647}
1648
1649static int
1650unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1651 Py_UCS4 ch)
1652{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001653 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001654 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 return -1;
1656 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1657 PyUnicode_DATA(*p_unicode),
1658 (*pos)++, ch);
1659 return 0;
1660}
1661
Victor Stinnerc5166102012-02-22 13:55:02 +01001662/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1663 Return the length of the input string.
1664
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001665 WARNING: The function doesn't copy the terminating null character and
1666 doesn't check the maximum character (may write a latin1 character in an
1667 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001668static Py_ssize_t
1669unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1670{
1671 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1672 void *data = PyUnicode_DATA(unicode);
1673
1674 switch (kind) {
1675 case PyUnicode_1BYTE_KIND: {
1676 Py_ssize_t len = strlen(str);
1677 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001678 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001679 return len;
1680 }
1681 case PyUnicode_2BYTE_KIND: {
1682 Py_UCS2 *start = (Py_UCS2 *)data + index;
1683 Py_UCS2 *ucs2 = start;
1684 assert(index <= PyUnicode_GET_LENGTH(unicode));
1685
1686 for (; *str; ++ucs2, ++str)
1687 *ucs2 = (Py_UCS2)*str;
1688
1689 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1690 return ucs2 - start;
1691 }
1692 default: {
1693 Py_UCS4 *start = (Py_UCS4 *)data + index;
1694 Py_UCS4 *ucs4 = start;
1695 assert(kind == PyUnicode_4BYTE_KIND);
1696 assert(index <= PyUnicode_GET_LENGTH(unicode));
1697
1698 for (; *str; ++ucs4, ++str)
1699 *ucs4 = (Py_UCS4)*str;
1700
1701 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1702 return ucs4 - start;
1703 }
1704 }
1705}
1706
1707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708static PyObject*
1709get_latin1_char(unsigned char ch)
1710{
Victor Stinnera464fc12011-10-02 20:39:30 +02001711 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001713 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 if (!unicode)
1715 return NULL;
1716 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001717 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718 unicode_latin1[ch] = unicode;
1719 }
1720 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001721 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001722}
1723
Alexander Belopolsky40018472011-02-26 01:02:56 +00001724PyObject *
1725PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001727 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728 Py_UCS4 maxchar = 0;
1729 Py_ssize_t num_surrogates;
1730
1731 if (u == NULL)
1732 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734 /* If the Unicode data is known at construction time, we can apply
1735 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 /* Optimization for empty strings */
1738 if (size == 0 && unicode_empty != NULL) {
1739 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001740 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001741 }
Tim Petersced69f82003-09-16 20:30:58 +00001742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 /* Single character Unicode objects in the Latin-1 range are
1744 shared when using this constructor */
1745 if (size == 1 && *u < 256)
1746 return get_latin1_char((unsigned char)*u);
1747
1748 /* If not empty and not single character, copy the Unicode data
1749 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001750 if (find_maxchar_surrogates(u, u + size,
1751 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001752 return NULL;
1753
Victor Stinner8faf8212011-12-08 22:14:11 +01001754 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755 if (!unicode)
1756 return NULL;
1757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 switch (PyUnicode_KIND(unicode)) {
1759 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001760 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1762 break;
1763 case PyUnicode_2BYTE_KIND:
1764#if Py_UNICODE_SIZE == 2
1765 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1766#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001767 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1769#endif
1770 break;
1771 case PyUnicode_4BYTE_KIND:
1772#if SIZEOF_WCHAR_T == 2
1773 /* This is the only case which has to process surrogates, thus
1774 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001775 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776#else
1777 assert(num_surrogates == 0);
1778 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1779#endif
1780 break;
1781 default:
1782 assert(0 && "Impossible state");
1783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001785 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786}
1787
Alexander Belopolsky40018472011-02-26 01:02:56 +00001788PyObject *
1789PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001790{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001791 if (size < 0) {
1792 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001793 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001794 return NULL;
1795 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001796 if (u != NULL)
1797 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1798 else
1799 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001800}
1801
Alexander Belopolsky40018472011-02-26 01:02:56 +00001802PyObject *
1803PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001804{
1805 size_t size = strlen(u);
1806 if (size > PY_SSIZE_T_MAX) {
1807 PyErr_SetString(PyExc_OverflowError, "input too long");
1808 return NULL;
1809 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001810 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001811}
1812
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001813PyObject *
1814_PyUnicode_FromId(_Py_Identifier *id)
1815{
1816 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001817 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1818 strlen(id->string),
1819 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001820 if (!id->object)
1821 return NULL;
1822 PyUnicode_InternInPlace(&id->object);
1823 assert(!id->next);
1824 id->next = static_strings;
1825 static_strings = id;
1826 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001827 return id->object;
1828}
1829
1830void
1831_PyUnicode_ClearStaticStrings()
1832{
1833 _Py_Identifier *i;
1834 for (i = static_strings; i; i = i->next) {
1835 Py_DECREF(i->object);
1836 i->object = NULL;
1837 i->next = NULL;
1838 }
1839}
1840
Benjamin Peterson0df54292012-03-26 14:50:32 -04001841/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001842
Victor Stinnere57b1c02011-09-28 22:20:48 +02001843static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001844unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001845{
Victor Stinner785938e2011-12-11 20:09:03 +01001846 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001847 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001848#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001849 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001850#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001851 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001852 }
Victor Stinner785938e2011-12-11 20:09:03 +01001853 unicode = PyUnicode_New(size, 127);
1854 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001855 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001856 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1857 assert(_PyUnicode_CheckConsistency(unicode, 1));
1858 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001859}
1860
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001861static Py_UCS4
1862kind_maxchar_limit(unsigned int kind)
1863{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001864 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001865 case PyUnicode_1BYTE_KIND:
1866 return 0x80;
1867 case PyUnicode_2BYTE_KIND:
1868 return 0x100;
1869 case PyUnicode_4BYTE_KIND:
1870 return 0x10000;
1871 default:
1872 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001873 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001874 }
1875}
1876
Victor Stinnere6abb482012-05-02 01:15:40 +02001877Py_LOCAL_INLINE(Py_UCS4)
1878align_maxchar(Py_UCS4 maxchar)
1879{
1880 if (maxchar <= 127)
1881 return 127;
1882 else if (maxchar <= 255)
1883 return 255;
1884 else if (maxchar <= 65535)
1885 return 65535;
1886 else
1887 return MAX_UNICODE;
1888}
1889
Victor Stinner702c7342011-10-05 13:50:52 +02001890static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001891_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001893 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001894 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001895
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001896 if (size == 0) {
1897 Py_INCREF(unicode_empty);
1898 return unicode_empty;
1899 }
1900 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001901 if (size == 1)
1902 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001903
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001904 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001905 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001906 if (!res)
1907 return NULL;
1908 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001909 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001910 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001911}
1912
Victor Stinnere57b1c02011-09-28 22:20:48 +02001913static PyObject*
1914_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001915{
1916 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001918
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001919 if (size == 0) {
1920 Py_INCREF(unicode_empty);
1921 return unicode_empty;
1922 }
1923 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001924 if (size == 1) {
1925 Py_UCS4 ch = u[0];
1926 if (ch < 256)
1927 return get_latin1_char((unsigned char)ch);
1928
1929 res = PyUnicode_New(1, ch);
1930 if (res == NULL)
1931 return NULL;
1932 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1933 assert(_PyUnicode_CheckConsistency(res, 1));
1934 return res;
1935 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (!res)
1940 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001941 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001942 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001943 else {
1944 _PyUnicode_CONVERT_BYTES(
1945 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1946 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001947 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948 return res;
1949}
1950
Victor Stinnere57b1c02011-09-28 22:20:48 +02001951static PyObject*
1952_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953{
1954 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001955 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001956
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001957 if (size == 0) {
1958 Py_INCREF(unicode_empty);
1959 return unicode_empty;
1960 }
1961 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001962 if (size == 1) {
1963 Py_UCS4 ch = u[0];
1964 if (ch < 256)
1965 return get_latin1_char((unsigned char)ch);
1966
1967 res = PyUnicode_New(1, ch);
1968 if (res == NULL)
1969 return NULL;
1970 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1971 assert(_PyUnicode_CheckConsistency(res, 1));
1972 return res;
1973 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001975 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001976 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 if (!res)
1978 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001979 if (max_char < 256)
1980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1981 PyUnicode_1BYTE_DATA(res));
1982 else if (max_char < 0x10000)
1983 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1984 PyUnicode_2BYTE_DATA(res));
1985 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001987 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 return res;
1989}
1990
1991PyObject*
1992PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1993{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001994 if (size < 0) {
1995 PyErr_SetString(PyExc_ValueError, "size must be positive");
1996 return NULL;
1997 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001998 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002000 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002002 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002004 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002005 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002006 PyErr_SetString(PyExc_SystemError, "invalid kind");
2007 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009}
2010
Victor Stinnerece58de2012-04-23 23:36:38 +02002011Py_UCS4
2012_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2013{
2014 enum PyUnicode_Kind kind;
2015 void *startptr, *endptr;
2016
2017 assert(PyUnicode_IS_READY(unicode));
2018 assert(0 <= start);
2019 assert(end <= PyUnicode_GET_LENGTH(unicode));
2020 assert(start <= end);
2021
2022 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2023 return PyUnicode_MAX_CHAR_VALUE(unicode);
2024
2025 if (start == end)
2026 return 127;
2027
Victor Stinner94d558b2012-04-27 22:26:58 +02002028 if (PyUnicode_IS_ASCII(unicode))
2029 return 127;
2030
Victor Stinnerece58de2012-04-23 23:36:38 +02002031 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002032 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002033 endptr = (char *)startptr + end * kind;
2034 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002035 switch(kind) {
2036 case PyUnicode_1BYTE_KIND:
2037 return ucs1lib_find_max_char(startptr, endptr);
2038 case PyUnicode_2BYTE_KIND:
2039 return ucs2lib_find_max_char(startptr, endptr);
2040 case PyUnicode_4BYTE_KIND:
2041 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002043 assert(0);
2044 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002045 }
2046}
2047
Victor Stinner25a4b292011-10-06 12:31:55 +02002048/* Ensure that a string uses the most efficient storage, if it is not the
2049 case: create a new string with of the right kind. Write NULL into *p_unicode
2050 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002051static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002052unicode_adjust_maxchar(PyObject **p_unicode)
2053{
2054 PyObject *unicode, *copy;
2055 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002056 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002057 unsigned int kind;
2058
2059 assert(p_unicode != NULL);
2060 unicode = *p_unicode;
2061 assert(PyUnicode_IS_READY(unicode));
2062 if (PyUnicode_IS_ASCII(unicode))
2063 return;
2064
2065 len = PyUnicode_GET_LENGTH(unicode);
2066 kind = PyUnicode_KIND(unicode);
2067 if (kind == PyUnicode_1BYTE_KIND) {
2068 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002069 max_char = ucs1lib_find_max_char(u, u + len);
2070 if (max_char >= 128)
2071 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002072 }
2073 else if (kind == PyUnicode_2BYTE_KIND) {
2074 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002075 max_char = ucs2lib_find_max_char(u, u + len);
2076 if (max_char >= 256)
2077 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 }
2079 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002081 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002082 max_char = ucs4lib_find_max_char(u, u + len);
2083 if (max_char >= 0x10000)
2084 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 copy = PyUnicode_New(len, max_char);
2087 copy_characters(copy, 0, unicode, 0, len);
2088 Py_DECREF(unicode);
2089 *p_unicode = copy;
2090}
2091
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002093_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002094{
Victor Stinner87af4f22011-11-21 23:03:47 +01002095 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002096 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002097
Victor Stinner034f6cf2011-09-30 02:26:44 +02002098 if (!PyUnicode_Check(unicode)) {
2099 PyErr_BadInternalCall();
2100 return NULL;
2101 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002102 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002103 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002104
Victor Stinner87af4f22011-11-21 23:03:47 +01002105 length = PyUnicode_GET_LENGTH(unicode);
2106 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002107 if (!copy)
2108 return NULL;
2109 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2110
Victor Stinner87af4f22011-11-21 23:03:47 +01002111 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2112 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002113 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002114 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002115}
2116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002117
Victor Stinnerbc603d12011-10-02 01:00:40 +02002118/* Widen Unicode objects to larger buffers. Don't write terminating null
2119 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002120
2121void*
2122_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2123{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002124 Py_ssize_t len;
2125 void *result;
2126 unsigned int skind;
2127
Benjamin Petersonbac79492012-01-14 13:34:47 -05002128 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002129 return NULL;
2130
2131 len = PyUnicode_GET_LENGTH(s);
2132 skind = PyUnicode_KIND(s);
2133 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002134 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 return NULL;
2136 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002137 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 case PyUnicode_2BYTE_KIND:
2139 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2140 if (!result)
2141 return PyErr_NoMemory();
2142 assert(skind == PyUnicode_1BYTE_KIND);
2143 _PyUnicode_CONVERT_BYTES(
2144 Py_UCS1, Py_UCS2,
2145 PyUnicode_1BYTE_DATA(s),
2146 PyUnicode_1BYTE_DATA(s) + len,
2147 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002149 case PyUnicode_4BYTE_KIND:
2150 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2151 if (!result)
2152 return PyErr_NoMemory();
2153 if (skind == PyUnicode_2BYTE_KIND) {
2154 _PyUnicode_CONVERT_BYTES(
2155 Py_UCS2, Py_UCS4,
2156 PyUnicode_2BYTE_DATA(s),
2157 PyUnicode_2BYTE_DATA(s) + len,
2158 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002160 else {
2161 assert(skind == PyUnicode_1BYTE_KIND);
2162 _PyUnicode_CONVERT_BYTES(
2163 Py_UCS1, Py_UCS4,
2164 PyUnicode_1BYTE_DATA(s),
2165 PyUnicode_1BYTE_DATA(s) + len,
2166 result);
2167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002169 default:
2170 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 }
Victor Stinner01698042011-10-04 00:04:26 +02002172 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return NULL;
2174}
2175
2176static Py_UCS4*
2177as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2178 int copy_null)
2179{
2180 int kind;
2181 void *data;
2182 Py_ssize_t len, targetlen;
2183 if (PyUnicode_READY(string) == -1)
2184 return NULL;
2185 kind = PyUnicode_KIND(string);
2186 data = PyUnicode_DATA(string);
2187 len = PyUnicode_GET_LENGTH(string);
2188 targetlen = len;
2189 if (copy_null)
2190 targetlen++;
2191 if (!target) {
2192 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2193 PyErr_NoMemory();
2194 return NULL;
2195 }
2196 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2197 if (!target) {
2198 PyErr_NoMemory();
2199 return NULL;
2200 }
2201 }
2202 else {
2203 if (targetsize < targetlen) {
2204 PyErr_Format(PyExc_SystemError,
2205 "string is longer than the buffer");
2206 if (copy_null && 0 < targetsize)
2207 target[0] = 0;
2208 return NULL;
2209 }
2210 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002211 if (kind == PyUnicode_1BYTE_KIND) {
2212 Py_UCS1 *start = (Py_UCS1 *) data;
2213 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002215 else if (kind == PyUnicode_2BYTE_KIND) {
2216 Py_UCS2 *start = (Py_UCS2 *) data;
2217 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2218 }
2219 else {
2220 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 if (copy_null)
2224 target[len] = 0;
2225 return target;
2226}
2227
2228Py_UCS4*
2229PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230 int copy_null)
2231{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002232 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 PyErr_BadInternalCall();
2234 return NULL;
2235 }
2236 return as_ucs4(string, target, targetsize, copy_null);
2237}
2238
2239Py_UCS4*
2240PyUnicode_AsUCS4Copy(PyObject *string)
2241{
2242 return as_ucs4(string, NULL, 0, 1);
2243}
2244
2245#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002246
Alexander Belopolsky40018472011-02-26 01:02:56 +00002247PyObject *
2248PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002251 if (size == 0) {
2252 Py_INCREF(unicode_empty);
2253 return unicode_empty;
2254 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002255 PyErr_BadInternalCall();
2256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002257 }
2258
Martin v. Löwis790465f2008-04-05 20:41:37 +00002259 if (size == -1) {
2260 size = wcslen(w);
2261 }
2262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264}
2265
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002267
Walter Dörwald346737f2007-05-31 10:44:43 +00002268static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002269makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2270 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002271{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002272 *fmt++ = '%';
2273 if (width) {
2274 if (zeropad)
2275 *fmt++ = '0';
2276 fmt += sprintf(fmt, "%d", width);
2277 }
2278 if (precision)
2279 fmt += sprintf(fmt, ".%d", precision);
2280 if (longflag)
2281 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002282 else if (longlongflag) {
2283 /* longlongflag should only ever be nonzero on machines with
2284 HAVE_LONG_LONG defined */
2285#ifdef HAVE_LONG_LONG
2286 char *f = PY_FORMAT_LONG_LONG;
2287 while (*f)
2288 *fmt++ = *f++;
2289#else
2290 /* we shouldn't ever get here */
2291 assert(0);
2292 *fmt++ = 'l';
2293#endif
2294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002295 else if (size_tflag) {
2296 char *f = PY_FORMAT_SIZE_T;
2297 while (*f)
2298 *fmt++ = *f++;
2299 }
2300 *fmt++ = c;
2301 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002302}
2303
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification located between the '%' and the conversion character.

   f must point to the '%'.  The decimal width, the optional ".precision"
   and the length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or
   "z"; each only recognized in front of 'd', 'u' or 'i') are parsed and
   stored through the corresponding output pointers, each of which may be
   NULL.  Return a pointer to the character where parsing stopped, normally
   the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%', then parse the width.precision part,
       e.g. "%2.5s" => width=2, precision=5 */
    for (f++; Py_ISDIGIT((unsigned)*f); )
        width = (width*10) + *f++ - '0';
    if (*f == '.') {
        for (f++; Py_ISDIGIT((unsigned)*f); )
            precision = (precision*10) + *f++ - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2368
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002369/* Maximum number of characters required for the output of %ld. 21 characters
2370 allow for a 64-bit integer (in decimal) and an optional sign. The
 terminating null character is not counted: PyUnicode_FromFormatV() adds
 one extra byte per number when it sizes its buffer. */
2371#define MAX_LONG_CHARS 21
2372/* Maximum number of characters required for the output of %lld.
2373 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2374 plus 1 for the sign. 53/22 is an upper bound for log10(256), and
 1 + (x - 1) / 22 is ceil(x / 22) in integer arithmetic. */
2375#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2376
Walter Dörwaldd2034312007-05-18 16:29:38 +00002377PyObject *
2378PyUnicode_FromFormatV(const char *format, va_list vargs)
2379{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002380 va_list count;
2381 Py_ssize_t callcount = 0;
2382 PyObject **callresults = NULL;
2383 PyObject **callresult = NULL;
2384 Py_ssize_t n = 0;
2385 int width = 0;
2386 int precision = 0;
2387 int zeropad;
2388 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002389 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002391 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2393 Py_UCS4 argmaxchar;
2394 Py_ssize_t numbersize = 0;
2395 char *numberresults = NULL;
2396 char *numberresult = NULL;
2397 Py_ssize_t i;
2398 int kind;
2399 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002400
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002401 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002402 /* step 1: count the number of %S/%R/%A/%s format specifications
2403 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2404 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002405 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002406 * also estimate a upper bound for all the number formats in the string,
2407 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002409 for (f = format; *f; f++) {
2410 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002411 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2413 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2414 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2415 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002418#ifdef HAVE_LONG_LONG
2419 if (longlongflag) {
2420 if (width < MAX_LONG_LONG_CHARS)
2421 width = MAX_LONG_LONG_CHARS;
2422 }
2423 else
2424#endif
2425 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2426 including sign. Decimal takes the most space. This
2427 isn't enough for octal. If a width is specified we
2428 need more (which we allocate later). */
2429 if (width < MAX_LONG_CHARS)
2430 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002431
2432 /* account for the size + '\0' to separate numbers
2433 inside of the numberresults buffer */
2434 numbersize += (width + 1);
2435 }
2436 }
2437 else if ((unsigned char)*f > 127) {
2438 PyErr_Format(PyExc_ValueError,
2439 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2440 "string, got a non-ASCII byte: 0x%02x",
2441 (unsigned char)*f);
2442 return NULL;
2443 }
2444 }
2445 /* step 2: allocate memory for the results of
2446 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2447 if (callcount) {
2448 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2449 if (!callresults) {
2450 PyErr_NoMemory();
2451 return NULL;
2452 }
2453 callresult = callresults;
2454 }
2455 /* step 2.5: allocate memory for the results of formating numbers */
2456 if (numbersize) {
2457 numberresults = PyObject_Malloc(numbersize);
2458 if (!numberresults) {
2459 PyErr_NoMemory();
2460 goto fail;
2461 }
2462 numberresult = numberresults;
2463 }
2464
2465 /* step 3: format numbers and figure out how large a buffer we need */
2466 for (f = format; *f; f++) {
2467 if (*f == '%') {
2468 const char* p;
2469 int longflag;
2470 int longlongflag;
2471 int size_tflag;
2472 int numprinted;
2473
2474 p = f;
2475 zeropad = (f[1] == '0');
2476 f = parse_format_flags(f, &width, &precision,
2477 &longflag, &longlongflag, &size_tflag);
2478 switch (*f) {
2479 case 'c':
2480 {
2481 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002482 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 n++;
2484 break;
2485 }
2486 case '%':
2487 n++;
2488 break;
2489 case 'i':
2490 case 'd':
2491 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2492 width, precision, *f);
2493 if (longflag)
2494 numprinted = sprintf(numberresult, fmt,
2495 va_arg(count, long));
2496#ifdef HAVE_LONG_LONG
2497 else if (longlongflag)
2498 numprinted = sprintf(numberresult, fmt,
2499 va_arg(count, PY_LONG_LONG));
2500#endif
2501 else if (size_tflag)
2502 numprinted = sprintf(numberresult, fmt,
2503 va_arg(count, Py_ssize_t));
2504 else
2505 numprinted = sprintf(numberresult, fmt,
2506 va_arg(count, int));
2507 n += numprinted;
2508 /* advance by +1 to skip over the '\0' */
2509 numberresult += (numprinted + 1);
2510 assert(*(numberresult - 1) == '\0');
2511 assert(*(numberresult - 2) != '\0');
2512 assert(numprinted >= 0);
2513 assert(numberresult <= numberresults + numbersize);
2514 break;
2515 case 'u':
2516 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2517 width, precision, 'u');
2518 if (longflag)
2519 numprinted = sprintf(numberresult, fmt,
2520 va_arg(count, unsigned long));
2521#ifdef HAVE_LONG_LONG
2522 else if (longlongflag)
2523 numprinted = sprintf(numberresult, fmt,
2524 va_arg(count, unsigned PY_LONG_LONG));
2525#endif
2526 else if (size_tflag)
2527 numprinted = sprintf(numberresult, fmt,
2528 va_arg(count, size_t));
2529 else
2530 numprinted = sprintf(numberresult, fmt,
2531 va_arg(count, unsigned int));
2532 n += numprinted;
2533 numberresult += (numprinted + 1);
2534 assert(*(numberresult - 1) == '\0');
2535 assert(*(numberresult - 2) != '\0');
2536 assert(numprinted >= 0);
2537 assert(numberresult <= numberresults + numbersize);
2538 break;
2539 case 'x':
2540 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2541 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2542 n += numprinted;
2543 numberresult += (numprinted + 1);
2544 assert(*(numberresult - 1) == '\0');
2545 assert(*(numberresult - 2) != '\0');
2546 assert(numprinted >= 0);
2547 assert(numberresult <= numberresults + numbersize);
2548 break;
2549 case 'p':
2550 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2551 /* %p is ill-defined: ensure leading 0x. */
2552 if (numberresult[1] == 'X')
2553 numberresult[1] = 'x';
2554 else if (numberresult[1] != 'x') {
2555 memmove(numberresult + 2, numberresult,
2556 strlen(numberresult) + 1);
2557 numberresult[0] = '0';
2558 numberresult[1] = 'x';
2559 numprinted += 2;
2560 }
2561 n += numprinted;
2562 numberresult += (numprinted + 1);
2563 assert(*(numberresult - 1) == '\0');
2564 assert(*(numberresult - 2) != '\0');
2565 assert(numprinted >= 0);
2566 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 break;
2568 case 's':
2569 {
2570 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002571 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002572 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002573 if (!str)
2574 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 /* since PyUnicode_DecodeUTF8 returns already flexible
2576 unicode objects, there is no need to call ready on them */
2577 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002578 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002580 /* Remember the str and switch to the next slot */
2581 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 break;
2583 }
2584 case 'U':
2585 {
2586 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002587 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 if (PyUnicode_READY(obj) == -1)
2589 goto fail;
2590 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002591 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002593 break;
2594 }
2595 case 'V':
2596 {
2597 PyObject *obj = va_arg(count, PyObject *);
2598 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002599 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002600 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002601 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002602 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 if (PyUnicode_READY(obj) == -1)
2604 goto fail;
2605 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002606 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002608 *callresult++ = NULL;
2609 }
2610 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002611 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002612 if (!str_obj)
2613 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002614 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002615 Py_DECREF(str_obj);
2616 goto fail;
2617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002619 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002621 *callresult++ = str_obj;
2622 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 break;
2624 }
2625 case 'S':
2626 {
2627 PyObject *obj = va_arg(count, PyObject *);
2628 PyObject *str;
2629 assert(obj);
2630 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002633 if (PyUnicode_READY(str) == -1) {
2634 Py_DECREF(str);
2635 goto fail;
2636 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002638 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002640 /* Remember the str and switch to the next slot */
2641 *callresult++ = str;
2642 break;
2643 }
2644 case 'R':
2645 {
2646 PyObject *obj = va_arg(count, PyObject *);
2647 PyObject *repr;
2648 assert(obj);
2649 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002650 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002652 if (PyUnicode_READY(repr) == -1) {
2653 Py_DECREF(repr);
2654 goto fail;
2655 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002657 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002659 /* Remember the repr and switch to the next slot */
2660 *callresult++ = repr;
2661 break;
2662 }
2663 case 'A':
2664 {
2665 PyObject *obj = va_arg(count, PyObject *);
2666 PyObject *ascii;
2667 assert(obj);
2668 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002669 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002671 if (PyUnicode_READY(ascii) == -1) {
2672 Py_DECREF(ascii);
2673 goto fail;
2674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002676 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002678 /* Remember the repr and switch to the next slot */
2679 *callresult++ = ascii;
2680 break;
2681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 default:
2683 /* if we stumble upon an unknown
2684 formatting code, copy the rest of
2685 the format string to the output
2686 string. (we cannot just skip the
2687 code, since there's no way to know
2688 what's in the argument list) */
2689 n += strlen(p);
2690 goto expand;
2691 }
2692 } else
2693 n++;
2694 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002695 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002697 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 we don't have to resize the string.
2699 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002700 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002701 if (!string)
2702 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002703 kind = PyUnicode_KIND(string);
2704 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002705 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002709 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002710 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002711
2712 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2714 /* checking for == because the last argument could be a empty
2715 string, which causes i to point to end, the assert at the end of
2716 the loop */
2717 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002718
Benjamin Peterson14339b62009-01-31 16:36:08 +00002719 switch (*f) {
2720 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002721 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 const int ordinal = va_arg(vargs, int);
2723 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002724 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002725 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002726 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002730 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002731 {
2732 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002733 /* unused, since we already have the result */
2734 if (*f == 'p')
2735 (void) va_arg(vargs, void *);
2736 else
2737 (void) va_arg(vargs, int);
2738 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002739 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002741 i += written;
2742 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 assert(*numberresult == '\0');
2744 numberresult++;
2745 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002747 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002748 case 's':
2749 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002750 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002752 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 size = PyUnicode_GET_LENGTH(*callresult);
2754 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002755 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002757 /* We're done with the unicode()/repr() => forget it */
2758 Py_DECREF(*callresult);
2759 /* switch to next unicode()/repr() result */
2760 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 break;
2762 }
2763 case 'U':
2764 {
2765 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 Py_ssize_t size;
2767 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2768 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002769 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002771 break;
2772 }
2773 case 'V':
2774 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002776 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002777 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 size = PyUnicode_GET_LENGTH(obj);
2780 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002781 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 size = PyUnicode_GET_LENGTH(*callresult);
2785 assert(PyUnicode_KIND(*callresult) <=
2786 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002787 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002789 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002791 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002792 break;
2793 }
2794 case 'S':
2795 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002796 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002798 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002799 /* unused, since we already have the result */
2800 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002802 copy_characters(string, i, *callresult, 0, size);
2803 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002804 /* We're done with the unicode()/repr() => forget it */
2805 Py_DECREF(*callresult);
2806 /* switch to next unicode()/repr() result */
2807 ++callresult;
2808 break;
2809 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002811 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 break;
2813 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002814 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002815 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002816 goto end;
2817 }
Victor Stinner1205f272010-09-11 00:54:47 +00002818 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 else {
2820 assert(i < PyUnicode_GET_LENGTH(string));
2821 PyUnicode_WRITE(kind, data, i++, *f);
2822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002825
Benjamin Peterson29060642009-01-31 22:14:21 +00002826 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 if (callresults)
2828 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 if (numberresults)
2830 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002831 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002833 if (callresults) {
2834 PyObject **callresult2 = callresults;
2835 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002836 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002837 ++callresult2;
2838 }
2839 PyObject_Free(callresults);
2840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 if (numberresults)
2842 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002843 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002844}
2845
Walter Dörwaldd2034312007-05-18 16:29:38 +00002846PyObject *
2847PyUnicode_FromFormat(const char *format, ...)
2848{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002849 PyObject* ret;
2850 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002851
2852#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002853 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002855 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002856#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002857 ret = PyUnicode_FromFormatV(format, vargs);
2858 va_end(vargs);
2859 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002860}
2861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002862#ifdef HAVE_WCHAR_H
2863
Victor Stinner5593d8a2010-10-02 11:11:27 +00002864/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2865 convert a Unicode object to a wide character string.
2866
Victor Stinnerd88d9832011-09-06 02:00:05 +02002867 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002868 character) required to convert the unicode object. Ignore size argument.
2869
Victor Stinnerd88d9832011-09-06 02:00:05 +02002870 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002871 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002872 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002873static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002874unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002875 wchar_t *w,
2876 Py_ssize_t size)
2877{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002878 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002879 const wchar_t *wstr;
2880
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002881 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 if (wstr == NULL)
2883 return -1;
2884
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002886 if (size > res)
2887 size = res + 1;
2888 else
2889 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002890 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002891 return res;
2892 }
2893 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002894 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002895}
2896
2897Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002898PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002899 wchar_t *w,
2900 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901{
2902 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002903 PyErr_BadInternalCall();
2904 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002906 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907}
2908
Victor Stinner137c34c2010-09-29 10:25:54 +00002909wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002910PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002911 Py_ssize_t *size)
2912{
2913 wchar_t* buffer;
2914 Py_ssize_t buflen;
2915
2916 if (unicode == NULL) {
2917 PyErr_BadInternalCall();
2918 return NULL;
2919 }
2920
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002921 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 if (buflen == -1)
2923 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002924 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002925 PyErr_NoMemory();
2926 return NULL;
2927 }
2928
Victor Stinner137c34c2010-09-29 10:25:54 +00002929 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2930 if (buffer == NULL) {
2931 PyErr_NoMemory();
2932 return NULL;
2933 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002934 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002935 if (buflen == -1)
2936 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002937 if (size != NULL)
2938 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002939 return buffer;
2940}
2941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002942#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002943
Alexander Belopolsky40018472011-02-26 01:02:56 +00002944PyObject *
2945PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002948 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002949 PyErr_SetString(PyExc_ValueError,
2950 "chr() arg not in range(0x110000)");
2951 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002952 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 if (ordinal < 256)
2955 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 v = PyUnicode_New(1, ordinal);
2958 if (v == NULL)
2959 return NULL;
2960 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002961 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002962 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002963}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002967{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002968 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002969 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002970 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002971 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002972 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 Py_INCREF(obj);
2974 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002975 }
2976 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002977 /* For a Unicode subtype that's not a Unicode object,
2978 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002979 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002980 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002981 PyErr_Format(PyExc_TypeError,
2982 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002983 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002984 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002989 const char *encoding,
2990 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002991{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002992 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002993 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002994
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 PyErr_BadInternalCall();
2997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002999
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003000 /* Decoding bytes objects is the most common case and should be fast */
3001 if (PyBytes_Check(obj)) {
3002 if (PyBytes_GET_SIZE(obj) == 0) {
3003 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003004 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003005 }
3006 else {
3007 v = PyUnicode_Decode(
3008 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3009 encoding, errors);
3010 }
3011 return v;
3012 }
3013
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003014 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyErr_SetString(PyExc_TypeError,
3016 "decoding str is not supported");
3017 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003018 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003019
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003020 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3021 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3022 PyErr_Format(PyExc_TypeError,
3023 "coercing to str: need bytes, bytearray "
3024 "or buffer-like object, %.80s found",
3025 Py_TYPE(obj)->tp_name);
3026 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003027 }
Tim Petersced69f82003-09-16 20:30:58 +00003028
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003029 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003031 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032 }
Tim Petersced69f82003-09-16 20:30:58 +00003033 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003034 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003035
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003036 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003037 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003038}
3039
/* Normalize an encoding name so that spelling variants such as "UTF_8"
   compare equal: upper-case ASCII letters are lowered and '_' is replaced
   with '-'.

   encoding:  name to normalize; NULL stands for the default, "utf-8".
   lower:     output buffer receiving the null-terminated normalized name.
   lower_len: capacity of `lower` in bytes, terminator included.

   Returns 1 on success, 0 if the normalized name (plus its terminator)
   does not fit into `lower`.  The buffer is never written past
   lower_len bytes, including for the NULL-encoding default. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for even the terminator nothing can be stored;
       this also keeps &lower[lower_len - 1] below from underflowing. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* "utf-8" needs 6 bytes including the terminator. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last slot is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3076
Alexander Belopolsky40018472011-02-26 01:02:56 +00003077PyObject *
3078PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003079 Py_ssize_t size,
3080 const char *encoding,
3081 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003082{
3083 PyObject *buffer = NULL, *unicode;
3084 Py_buffer info;
3085 char lower[11]; /* Enough for any encoding shortcut */
3086
Fred Drakee4315f52000-05-09 19:53:39 +00003087 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003088 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003089 if ((strcmp(lower, "utf-8") == 0) ||
3090 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003091 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003092 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003093 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003094 (strcmp(lower, "iso-8859-1") == 0))
3095 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003096#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003097 else if (strcmp(lower, "mbcs") == 0)
3098 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003099#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003100 else if (strcmp(lower, "ascii") == 0)
3101 return PyUnicode_DecodeASCII(s, size, errors);
3102 else if (strcmp(lower, "utf-16") == 0)
3103 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3104 else if (strcmp(lower, "utf-32") == 0)
3105 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107
3108 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003109 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003110 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003111 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003112 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113 if (buffer == NULL)
3114 goto onError;
3115 unicode = PyCodec_Decode(buffer, encoding, errors);
3116 if (unicode == NULL)
3117 goto onError;
3118 if (!PyUnicode_Check(unicode)) {
3119 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003120 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003121 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122 Py_DECREF(unicode);
3123 goto onError;
3124 }
3125 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003126 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003127
Benjamin Peterson29060642009-01-31 22:14:21 +00003128 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003129 Py_XDECREF(buffer);
3130 return NULL;
3131}
3132
Alexander Belopolsky40018472011-02-26 01:02:56 +00003133PyObject *
3134PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003135 const char *encoding,
3136 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003137{
3138 PyObject *v;
3139
3140 if (!PyUnicode_Check(unicode)) {
3141 PyErr_BadArgument();
3142 goto onError;
3143 }
3144
3145 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003147
3148 /* Decode via the codec registry */
3149 v = PyCodec_Decode(unicode, encoding, errors);
3150 if (v == NULL)
3151 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003152 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153
Benjamin Peterson29060642009-01-31 22:14:21 +00003154 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003155 return NULL;
3156}
3157
Alexander Belopolsky40018472011-02-26 01:02:56 +00003158PyObject *
3159PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003160 const char *encoding,
3161 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003162{
3163 PyObject *v;
3164
3165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
3167 goto onError;
3168 }
3169
3170 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003171 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172
3173 /* Decode via the codec registry */
3174 v = PyCodec_Decode(unicode, encoding, errors);
3175 if (v == NULL)
3176 goto onError;
3177 if (!PyUnicode_Check(v)) {
3178 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003179 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003180 Py_TYPE(v)->tp_name);
3181 Py_DECREF(v);
3182 goto onError;
3183 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003184 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185
Benjamin Peterson29060642009-01-31 22:14:21 +00003186 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003187 return NULL;
3188}
3189
Alexander Belopolsky40018472011-02-26 01:02:56 +00003190PyObject *
3191PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003192 Py_ssize_t size,
3193 const char *encoding,
3194 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195{
3196 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003197
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 unicode = PyUnicode_FromUnicode(s, size);
3199 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003201 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3202 Py_DECREF(unicode);
3203 return v;
3204}
3205
Alexander Belopolsky40018472011-02-26 01:02:56 +00003206PyObject *
3207PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003208 const char *encoding,
3209 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003210{
3211 PyObject *v;
3212
3213 if (!PyUnicode_Check(unicode)) {
3214 PyErr_BadArgument();
3215 goto onError;
3216 }
3217
3218 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003219 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003220
3221 /* Encode via the codec registry */
3222 v = PyCodec_Encode(unicode, encoding, errors);
3223 if (v == NULL)
3224 goto onError;
3225 return v;
3226
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003228 return NULL;
3229}
3230
/* Locate the first wide character of `wstr` that wcstombs() cannot convert
   in the current locale.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Returns the index, in
   wchar_t units, of the first unit that fails to convert, or 0 when every
   unit converts on its own. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    const wchar_t *unit_start;
    char scratch[MB_LEN_MAX];
    /* Holds one character (or one surrogate pair) plus its terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif

    while (*cursor != L'\0') {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep a surrogate pair together so it converts as one char. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            unit[2] = L'\0';
            cursor += 2;
        }
        else {
            unit[0] = *cursor++;
            unit[1] = L'\0';
        }
#else
        unit[0] = *cursor++;
        unit[1] = L'\0';
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3277
Victor Stinner1b579672011-12-17 05:47:23 +01003278static int
3279locale_error_handler(const char *errors, int *surrogateescape)
3280{
3281 if (errors == NULL) {
3282 *surrogateescape = 0;
3283 return 0;
3284 }
3285
3286 if (strcmp(errors, "strict") == 0) {
3287 *surrogateescape = 0;
3288 return 0;
3289 }
3290 if (strcmp(errors, "surrogateescape") == 0) {
3291 *surrogateescape = 1;
3292 return 0;
3293 }
3294 PyErr_Format(PyExc_ValueError,
3295 "only 'strict' and 'surrogateescape' error handlers "
3296 "are supported, not '%s'",
3297 errors);
3298 return -1;
3299}
3300
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003301PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003302PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003303{
3304 Py_ssize_t wlen, wlen2;
3305 wchar_t *wstr;
3306 PyObject *bytes = NULL;
3307 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003308 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003309 PyObject *exc;
3310 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003311 int surrogateescape;
3312
3313 if (locale_error_handler(errors, &surrogateescape) < 0)
3314 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003315
3316 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3317 if (wstr == NULL)
3318 return NULL;
3319
3320 wlen2 = wcslen(wstr);
3321 if (wlen2 != wlen) {
3322 PyMem_Free(wstr);
3323 PyErr_SetString(PyExc_TypeError, "embedded null character");
3324 return NULL;
3325 }
3326
3327 if (surrogateescape) {
3328 /* locale encoding with surrogateescape */
3329 char *str;
3330
3331 str = _Py_wchar2char(wstr, &error_pos);
3332 if (str == NULL) {
3333 if (error_pos == (size_t)-1) {
3334 PyErr_NoMemory();
3335 PyMem_Free(wstr);
3336 return NULL;
3337 }
3338 else {
3339 goto encode_error;
3340 }
3341 }
3342 PyMem_Free(wstr);
3343
3344 bytes = PyBytes_FromString(str);
3345 PyMem_Free(str);
3346 }
3347 else {
3348 size_t len, len2;
3349
3350 len = wcstombs(NULL, wstr, 0);
3351 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003352 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003353 goto encode_error;
3354 }
3355
3356 bytes = PyBytes_FromStringAndSize(NULL, len);
3357 if (bytes == NULL) {
3358 PyMem_Free(wstr);
3359 return NULL;
3360 }
3361
3362 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3363 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003364 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003365 goto encode_error;
3366 }
3367 PyMem_Free(wstr);
3368 }
3369 return bytes;
3370
3371encode_error:
3372 errmsg = strerror(errno);
3373 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003374
3375 if (error_pos == (size_t)-1)
3376 error_pos = wcstombs_errorpos(wstr);
3377
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003378 PyMem_Free(wstr);
3379 Py_XDECREF(bytes);
3380
Victor Stinner2f197072011-12-17 07:08:30 +01003381 if (errmsg != NULL) {
3382 size_t errlen;
3383 wstr = _Py_char2wchar(errmsg, &errlen);
3384 if (wstr != NULL) {
3385 reason = PyUnicode_FromWideChar(wstr, errlen);
3386 PyMem_Free(wstr);
3387 } else
3388 errmsg = NULL;
3389 }
3390 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003391 reason = PyUnicode_FromString(
3392 "wcstombs() encountered an unencodable "
3393 "wide character");
3394 if (reason == NULL)
3395 return NULL;
3396
3397 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3398 "locale", unicode,
3399 (Py_ssize_t)error_pos,
3400 (Py_ssize_t)(error_pos+1),
3401 reason);
3402 Py_DECREF(reason);
3403 if (exc != NULL) {
3404 PyCodec_StrictErrors(exc);
3405 Py_XDECREF(exc);
3406 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003407 return NULL;
3408}
3409
Victor Stinnerad158722010-10-27 00:25:46 +00003410PyObject *
3411PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003412{
Victor Stinner99b95382011-07-04 14:23:54 +02003413#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003414 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003415#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003416 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003417#else
Victor Stinner793b5312011-04-27 00:24:21 +02003418 PyInterpreterState *interp = PyThreadState_GET()->interp;
3419 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3420 cannot use it to encode and decode filenames before it is loaded. Load
3421 the Python codec requires to encode at least its own filename. Use the C
3422 version of the locale codec until the codec registry is initialized and
3423 the Python codec is loaded.
3424
3425 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3426 cannot only rely on it: check also interp->fscodec_initialized for
3427 subinterpreters. */
3428 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003429 return PyUnicode_AsEncodedString(unicode,
3430 Py_FileSystemDefaultEncoding,
3431 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003432 }
3433 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003434 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003435 }
Victor Stinnerad158722010-10-27 00:25:46 +00003436#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
3440PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443{
3444 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003445 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003446
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 if (!PyUnicode_Check(unicode)) {
3448 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 }
Fred Drakee4315f52000-05-09 19:53:39 +00003451
Fred Drakee4315f52000-05-09 19:53:39 +00003452 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003453 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003454 if ((strcmp(lower, "utf-8") == 0) ||
3455 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003456 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003457 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003459 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003460 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003461 }
Victor Stinner37296e82010-06-10 13:36:23 +00003462 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003463 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003464 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003466#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003467 else if (strcmp(lower, "mbcs") == 0)
3468 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003469#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003470 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003471 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003472 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473
3474 /* Encode via the codec registry */
3475 v = PyCodec_Encode(unicode, encoding, errors);
3476 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003477 return NULL;
3478
3479 /* The normal path */
3480 if (PyBytes_Check(v))
3481 return v;
3482
3483 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003484 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003485 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003486 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003487
3488 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3489 "encoder %s returned bytearray instead of bytes",
3490 encoding);
3491 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003492 Py_DECREF(v);
3493 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003494 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003495
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003496 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3497 Py_DECREF(v);
3498 return b;
3499 }
3500
3501 PyErr_Format(PyExc_TypeError,
3502 "encoder did not return a bytes object (type=%.400s)",
3503 Py_TYPE(v)->tp_name);
3504 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003505 return NULL;
3506}
3507
Alexander Belopolsky40018472011-02-26 01:02:56 +00003508PyObject *
3509PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003510 const char *encoding,
3511 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003512{
3513 PyObject *v;
3514
3515 if (!PyUnicode_Check(unicode)) {
3516 PyErr_BadArgument();
3517 goto onError;
3518 }
3519
3520 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003521 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003522
3523 /* Encode via the codec registry */
3524 v = PyCodec_Encode(unicode, encoding, errors);
3525 if (v == NULL)
3526 goto onError;
3527 if (!PyUnicode_Check(v)) {
3528 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003529 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003530 Py_TYPE(v)->tp_name);
3531 Py_DECREF(v);
3532 goto onError;
3533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003535
Benjamin Peterson29060642009-01-31 22:14:21 +00003536 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537 return NULL;
3538}
3539
/* Locate the first byte sequence in the `len` bytes at `str` that cannot be
   decoded in the current locale.

   When mbrtowc() is available, the input is decoded one character at a
   time and the byte offset of the first invalid or incomplete sequence is
   returned.  Returns 0 when no such sequence is found, and always 0 when
   mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    wchar_t wc;
    size_t consumed;

    memset(&state, 0, sizeof state);
    for (; len != 0; cursor += consumed, len -= consumed) {
        consumed = mbrtowc(&wc, (char*)cursor, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3570
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003571PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003573 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003574{
3575 wchar_t smallbuf[256];
3576 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3577 wchar_t *wstr;
3578 size_t wlen, wlen2;
3579 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003580 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003581 size_t error_pos;
3582 char *errmsg;
3583 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003584
3585 if (locale_error_handler(errors, &surrogateescape) < 0)
3586 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003587
3588 if (str[len] != '\0' || len != strlen(str)) {
3589 PyErr_SetString(PyExc_TypeError, "embedded null character");
3590 return NULL;
3591 }
3592
3593 if (surrogateescape)
3594 {
3595 wstr = _Py_char2wchar(str, &wlen);
3596 if (wstr == NULL) {
3597 if (wlen == (size_t)-1)
3598 PyErr_NoMemory();
3599 else
3600 PyErr_SetFromErrno(PyExc_OSError);
3601 return NULL;
3602 }
3603
3604 unicode = PyUnicode_FromWideChar(wstr, wlen);
3605 PyMem_Free(wstr);
3606 }
3607 else {
3608#ifndef HAVE_BROKEN_MBSTOWCS
3609 wlen = mbstowcs(NULL, str, 0);
3610#else
3611 wlen = len;
3612#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003613 if (wlen == (size_t)-1)
3614 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003615 if (wlen+1 <= smallbuf_len) {
3616 wstr = smallbuf;
3617 }
3618 else {
3619 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3620 return PyErr_NoMemory();
3621
3622 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3623 if (!wstr)
3624 return PyErr_NoMemory();
3625 }
3626
3627 /* This shouldn't fail now */
3628 wlen2 = mbstowcs(wstr, str, wlen+1);
3629 if (wlen2 == (size_t)-1) {
3630 if (wstr != smallbuf)
3631 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003632 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003633 }
3634#ifdef HAVE_BROKEN_MBSTOWCS
3635 assert(wlen2 == wlen);
3636#endif
3637 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3638 if (wstr != smallbuf)
3639 PyMem_Free(wstr);
3640 }
3641 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003642
3643decode_error:
3644 errmsg = strerror(errno);
3645 assert(errmsg != NULL);
3646
3647 error_pos = mbstowcs_errorpos(str, len);
3648 if (errmsg != NULL) {
3649 size_t errlen;
3650 wstr = _Py_char2wchar(errmsg, &errlen);
3651 if (wstr != NULL) {
3652 reason = PyUnicode_FromWideChar(wstr, errlen);
3653 PyMem_Free(wstr);
3654 } else
3655 errmsg = NULL;
3656 }
3657 if (errmsg == NULL)
3658 reason = PyUnicode_FromString(
3659 "mbstowcs() encountered an invalid multibyte sequence");
3660 if (reason == NULL)
3661 return NULL;
3662
3663 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3664 "locale", str, len,
3665 (Py_ssize_t)error_pos,
3666 (Py_ssize_t)(error_pos+1),
3667 reason);
3668 Py_DECREF(reason);
3669 if (exc != NULL) {
3670 PyCodec_StrictErrors(exc);
3671 Py_XDECREF(exc);
3672 }
3673 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003674}
3675
3676PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003677PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003678{
3679 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003680 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003681}
3682
3683
3684PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003685PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003686 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003687 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3688}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003689
Christian Heimes5894ba72007-11-04 11:43:14 +00003690PyObject*
3691PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3692{
Victor Stinner99b95382011-07-04 14:23:54 +02003693#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003694 return PyUnicode_DecodeMBCS(s, size, NULL);
3695#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003696 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003697#else
Victor Stinner793b5312011-04-27 00:24:21 +02003698 PyInterpreterState *interp = PyThreadState_GET()->interp;
3699 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3700 cannot use it to encode and decode filenames before it is loaded. Load
3701 the Python codec requires to encode at least its own filename. Use the C
3702 version of the locale codec until the codec registry is initialized and
3703 the Python codec is loaded.
3704
3705 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3706 cannot only rely on it: check also interp->fscodec_initialized for
3707 subinterpreters. */
3708 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003709 return PyUnicode_Decode(s, size,
3710 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003711 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003712 }
3713 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003714 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003715 }
Victor Stinnerad158722010-10-27 00:25:46 +00003716#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003717}
3718
Martin v. Löwis011e8422009-05-05 04:43:17 +00003719
3720int
Antoine Pitrou13348842012-01-29 18:36:34 +01003721_PyUnicode_HasNULChars(PyObject* s)
3722{
3723 static PyObject *nul = NULL;
3724
3725 if (nul == NULL)
3726 nul = PyUnicode_FromStringAndSize("\0", 1);
3727 if (nul == NULL)
3728 return -1;
3729 return PyUnicode_Contains(s, nul);
3730}
3731
3732
3733int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003734PyUnicode_FSConverter(PyObject* arg, void* addr)
3735{
3736 PyObject *output = NULL;
3737 Py_ssize_t size;
3738 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003739 if (arg == NULL) {
3740 Py_DECREF(*(PyObject**)addr);
3741 return 1;
3742 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003743 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003744 output = arg;
3745 Py_INCREF(output);
3746 }
3747 else {
3748 arg = PyUnicode_FromObject(arg);
3749 if (!arg)
3750 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003751 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003752 Py_DECREF(arg);
3753 if (!output)
3754 return 0;
3755 if (!PyBytes_Check(output)) {
3756 Py_DECREF(output);
3757 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3758 return 0;
3759 }
3760 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003761 size = PyBytes_GET_SIZE(output);
3762 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003763 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003764 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003765 Py_DECREF(output);
3766 return 0;
3767 }
3768 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003769 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770}
3771
3772
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003773int
3774PyUnicode_FSDecoder(PyObject* arg, void* addr)
3775{
3776 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003777 if (arg == NULL) {
3778 Py_DECREF(*(PyObject**)addr);
3779 return 1;
3780 }
3781 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003782 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003784 output = arg;
3785 Py_INCREF(output);
3786 }
3787 else {
3788 arg = PyBytes_FromObject(arg);
3789 if (!arg)
3790 return 0;
3791 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3792 PyBytes_GET_SIZE(arg));
3793 Py_DECREF(arg);
3794 if (!output)
3795 return 0;
3796 if (!PyUnicode_Check(output)) {
3797 Py_DECREF(output);
3798 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3799 return 0;
3800 }
3801 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003802 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003803 Py_DECREF(output);
3804 return 0;
3805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003807 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003808 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3809 Py_DECREF(output);
3810 return 0;
3811 }
3812 *(PyObject**)addr = output;
3813 return Py_CLEANUP_SUPPORTED;
3814}
3815
3816
Martin v. Löwis5b222132007-06-10 09:51:05 +00003817char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003819{
Christian Heimesf3863112007-11-22 07:46:41 +00003820 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003822 if (!PyUnicode_Check(unicode)) {
3823 PyErr_BadArgument();
3824 return NULL;
3825 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003827 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003829 if (PyUnicode_UTF8(unicode) == NULL) {
3830 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3832 if (bytes == NULL)
3833 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3835 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 Py_DECREF(bytes);
3837 return NULL;
3838 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3840 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3841 PyBytes_AS_STRING(bytes),
3842 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003843 Py_DECREF(bytes);
3844 }
3845
3846 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003847 *psize = PyUnicode_UTF8_LENGTH(unicode);
3848 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003849}
3850
3851char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3855}
3856
3857#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003858static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859#endif
3860
3861
3862Py_UNICODE *
3863PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 const unsigned char *one_byte;
3866#if SIZEOF_WCHAR_T == 4
3867 const Py_UCS2 *two_bytes;
3868#else
3869 const Py_UCS4 *four_bytes;
3870 const Py_UCS4 *ucs4_end;
3871 Py_ssize_t num_surrogates;
3872#endif
3873 wchar_t *w;
3874 wchar_t *wchar_end;
3875
3876 if (!PyUnicode_Check(unicode)) {
3877 PyErr_BadArgument();
3878 return NULL;
3879 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 assert(_PyUnicode_KIND(unicode) != 0);
3883 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884
3885#ifdef Py_DEBUG
3886 ++unicode_as_unicode_calls;
3887#endif
3888
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3892 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893 num_surrogates = 0;
3894
3895 for (; four_bytes < ucs4_end; ++four_bytes) {
3896 if (*four_bytes > 0xFFFF)
3897 ++num_surrogates;
3898 }
3899
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3901 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3902 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903 PyErr_NoMemory();
3904 return NULL;
3905 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003906 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003908 w = _PyUnicode_WSTR(unicode);
3909 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3910 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3912 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003913 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003915 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3916 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 }
3918 else
3919 *w = *four_bytes;
3920
3921 if (w > wchar_end) {
3922 assert(0 && "Miscalculated string end");
3923 }
3924 }
3925 *w = 0;
3926#else
3927 /* sizeof(wchar_t) == 4 */
3928 Py_FatalError("Impossible unicode object state, wstr and str "
3929 "should share memory already.");
3930 return NULL;
3931#endif
3932 }
3933 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003934 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3935 (_PyUnicode_LENGTH(unicode) + 1));
3936 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003937 PyErr_NoMemory();
3938 return NULL;
3939 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003940 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3941 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3942 w = _PyUnicode_WSTR(unicode);
3943 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003944
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3946 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003947 for (; w < wchar_end; ++one_byte, ++w)
3948 *w = *one_byte;
3949 /* null-terminate the wstr */
3950 *w = 0;
3951 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003952 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003954 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 for (; w < wchar_end; ++two_bytes, ++w)
3956 *w = *two_bytes;
3957 /* null-terminate the wstr */
3958 *w = 0;
3959#else
3960 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 PyObject_FREE(_PyUnicode_WSTR(unicode));
3962 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003963 Py_FatalError("Impossible unicode object state, wstr "
3964 "and str should share memory already.");
3965 return NULL;
3966#endif
3967 }
3968 else {
3969 assert(0 && "This should never happen.");
3970 }
3971 }
3972 }
3973 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003974 *size = PyUnicode_WSTR_LENGTH(unicode);
3975 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003976}
3977
Alexander Belopolsky40018472011-02-26 01:02:56 +00003978Py_UNICODE *
3979PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982}
3983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984
Alexander Belopolsky40018472011-02-26 01:02:56 +00003985Py_ssize_t
3986PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987{
3988 if (!PyUnicode_Check(unicode)) {
3989 PyErr_BadArgument();
3990 goto onError;
3991 }
3992 return PyUnicode_GET_SIZE(unicode);
3993
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 return -1;
3996}
3997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998Py_ssize_t
3999PyUnicode_GetLength(PyObject *unicode)
4000{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004001 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 PyErr_BadArgument();
4003 return -1;
4004 }
4005
4006 return PyUnicode_GET_LENGTH(unicode);
4007}
4008
4009Py_UCS4
4010PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4011{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004012 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4013 PyErr_BadArgument();
4014 return (Py_UCS4)-1;
4015 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004016 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004017 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 return (Py_UCS4)-1;
4019 }
4020 return PyUnicode_READ_CHAR(unicode, index);
4021}
4022
4023int
4024PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4025{
4026 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004027 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 return -1;
4029 }
Victor Stinner488fa492011-12-12 00:01:39 +01004030 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004031 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004032 PyErr_SetString(PyExc_IndexError, "string index out of range");
4033 return -1;
4034 }
Victor Stinner488fa492011-12-12 00:01:39 +01004035 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004036 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004037 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4038 PyErr_SetString(PyExc_ValueError, "character out of range");
4039 return -1;
4040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4042 index, ch);
4043 return 0;
4044}
4045
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is static and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4051
Victor Stinner554f3f02010-06-16 23:33:54 +00004052/* create or adjust a UnicodeDecodeError */
4053static void
4054make_decode_exception(PyObject **exceptionObject,
4055 const char *encoding,
4056 const char *input, Py_ssize_t length,
4057 Py_ssize_t startpos, Py_ssize_t endpos,
4058 const char *reason)
4059{
4060 if (*exceptionObject == NULL) {
4061 *exceptionObject = PyUnicodeDecodeError_Create(
4062 encoding, input, length, startpos, endpos, reason);
4063 }
4064 else {
4065 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4066 goto onError;
4067 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4068 goto onError;
4069 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4070 goto onError;
4071 }
4072 return;
4073
4074onError:
4075 Py_DECREF(*exceptionObject);
4076 *exceptionObject = NULL;
4077}
4078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079/* error handling callback helper:
4080 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004081 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004082 and adjust various state variables.
4083 return 0 on success, -1 on error
4084*/
4085
Alexander Belopolsky40018472011-02-26 01:02:56 +00004086static int
4087unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004088 const char *encoding, const char *reason,
4089 const char **input, const char **inend, Py_ssize_t *startinpos,
4090 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004091 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004093 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004094
4095 PyObject *restuple = NULL;
4096 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004097 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004098 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004099 Py_ssize_t requiredsize;
4100 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004101 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102 int res = -1;
4103
Victor Stinner596a6c42011-11-09 00:02:18 +01004104 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4105 outsize = PyUnicode_GET_LENGTH(*output);
4106 else
4107 outsize = _PyUnicode_WSTR_LENGTH(*output);
4108
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004109 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004110 *errorHandler = PyCodec_LookupError(errors);
4111 if (*errorHandler == NULL)
4112 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004113 }
4114
Victor Stinner554f3f02010-06-16 23:33:54 +00004115 make_decode_exception(exceptionObject,
4116 encoding,
4117 *input, *inend - *input,
4118 *startinpos, *endinpos,
4119 reason);
4120 if (*exceptionObject == NULL)
4121 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004122
4123 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4124 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004127 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 }
4130 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004132 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004133 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004134
4135 /* Copy back the bytes variables, which might have been modified by the
4136 callback */
4137 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4138 if (!inputobj)
4139 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004140 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004143 *input = PyBytes_AS_STRING(inputobj);
4144 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004145 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004146 /* we can DECREF safely, as the exception has another reference,
4147 so the object won't go away. */
4148 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004149
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004150 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004152 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004153 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4154 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004155 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156
Victor Stinner596a6c42011-11-09 00:02:18 +01004157 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4158 /* need more space? (at least enough for what we
4159 have+the replacement+the rest of the string (starting
4160 at the new input position), so we won't have to check space
4161 when there are no errors in the rest of the string) */
4162 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4163 requiredsize = *outpos + replen + insize-newpos;
4164 if (requiredsize > outsize) {
4165 if (requiredsize<2*outsize)
4166 requiredsize = 2*outsize;
4167 if (unicode_resize(output, requiredsize) < 0)
4168 goto onError;
4169 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004170 if (unicode_widen(output, *outpos,
4171 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004172 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004173 copy_characters(*output, *outpos, repunicode, 0, replen);
4174 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004175 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004176 else {
4177 wchar_t *repwstr;
4178 Py_ssize_t repwlen;
4179 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4180 if (repwstr == NULL)
4181 goto onError;
4182 /* need more space? (at least enough for what we
4183 have+the replacement+the rest of the string (starting
4184 at the new input position), so we won't have to check space
4185 when there are no errors in the rest of the string) */
4186 requiredsize = *outpos + repwlen + insize-newpos;
4187 if (requiredsize > outsize) {
4188 if (requiredsize < 2*outsize)
4189 requiredsize = 2*outsize;
4190 if (unicode_resize(output, requiredsize) < 0)
4191 goto onError;
4192 }
4193 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4194 *outpos += repwlen;
4195 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004197 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004198
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 /* we made it! */
4200 res = 0;
4201
Benjamin Peterson29060642009-01-31 22:14:21 +00004202 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004203 Py_XDECREF(restuple);
4204 return res;
4205}
4206
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004207/* --- UTF-7 Codec -------------------------------------------------------- */
4208
Antoine Pitrou244651a2009-05-04 18:56:13 +00004209/* See RFC2152 for details. We encode conservatively and decode liberally. */
4210
/* Three simple macros defining base-64 (alphabet A-Z, a-z, 0-9, '+', '/').
   Each macro evaluates its argument more than once, so the argument
   must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   (A-Z -> 0..25, a-z -> 26..51, 0-9 -> 52..61, '+' -> 62, '/' -> 63) */

#define FROM_BASE64(c)                              \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :       \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :  \
     ((c) == '+') ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4241
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated more than once and must not have side effects.
 * directO and directWS are parenthesized so that compound expressions
 * (e.g. "a || b") can be passed without operator-precedence surprises. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004287
Alexander Belopolsky40018472011-02-26 01:02:56 +00004288PyObject *
4289PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004290 Py_ssize_t size,
4291 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004293 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4294}
4295
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296/* The decoder. The only state we preserve is our read position,
4297 * i.e. how many characters we have consumed. So if we end in the
4298 * middle of a shift sequence we have to back off the read position
4299 * and the output to the beginning of the sequence, otherwise we lose
4300 * all the shift state (seen bits, number of bits seen, high
4301 * surrogate). */
4302
Alexander Belopolsky40018472011-02-26 01:02:56 +00004303PyObject *
4304PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004305 Py_ssize_t size,
4306 const char *errors,
4307 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004308{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004309 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004310 Py_ssize_t startinpos;
4311 Py_ssize_t endinpos;
4312 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004314 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004315 const char *errmsg = "";
4316 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 unsigned int base64bits = 0;
4319 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004320 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004321 PyObject *errorHandler = NULL;
4322 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004324 /* Start off assuming it's all ASCII. Widen later as necessary. */
4325 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004326 if (!unicode)
4327 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004328 if (size == 0) {
4329 if (consumed)
4330 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004331 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004332 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004334 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004335 e = s + size;
4336
4337 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004338 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004339 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004340 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004341
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 if (inShift) { /* in a base-64 section */
4343 if (IS_BASE64(ch)) { /* consume a base-64 character */
4344 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4345 base64bits += 6;
4346 s++;
4347 if (base64bits >= 16) {
4348 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004349 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004350 base64bits -= 16;
4351 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4352 if (surrogate) {
4353 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004354 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4355 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004356 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4357 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004359 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 }
4361 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004362 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4363 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 }
4366 }
Victor Stinner551ac952011-11-29 22:58:13 +01004367 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 /* first surrogate */
4369 surrogate = outCh;
4370 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004372 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4373 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 }
4375 }
4376 }
4377 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 inShift = 0;
4379 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004381 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4382 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004383 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 if (base64bits > 0) { /* left-over bits */
4386 if (base64bits >= 6) {
4387 /* We've seen at least one base-64 character */
4388 errmsg = "partial character in shift sequence";
4389 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 else {
4392 /* Some bits remain; they should be zero */
4393 if (base64buffer != 0) {
4394 errmsg = "non-zero padding bits in shift sequence";
4395 goto utf7Error;
4396 }
4397 }
4398 }
4399 if (ch != '-') {
4400 /* '-' is absorbed; other terminating
4401 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004402 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4403 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004404 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
4406 }
4407 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004409 s++; /* consume '+' */
4410 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4413 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004414 }
4415 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004417 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004419 }
4420 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004421 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4423 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004424 s++;
4425 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 else {
4427 startinpos = s-starts;
4428 s++;
4429 errmsg = "unexpected special character";
4430 goto utf7Error;
4431 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004432 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004433utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434 endinpos = s-starts;
4435 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 errors, &errorHandler,
4437 "utf7", errmsg,
4438 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004439 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441 }
4442
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443 /* end of string */
4444
4445 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4446 /* if we're in an inconsistent state, that's an error */
4447 if (surrogate ||
4448 (base64bits >= 6) ||
4449 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004450 endinpos = size;
4451 if (unicode_decode_call_errorhandler(
4452 errors, &errorHandler,
4453 "utf7", "unterminated shift sequence",
4454 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004455 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 goto onError;
4457 if (s < e)
4458 goto restart;
4459 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461
4462 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004465 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
4468 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004469 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004471 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004473 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 goto onError;
4475
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 Py_XDECREF(errorHandler);
4477 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004478 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 Py_XDECREF(errorHandler);
4482 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483 Py_DECREF(unicode);
4484 return NULL;
4485}
4486
4487
Alexander Belopolsky40018472011-02-26 01:02:56 +00004488PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004489_PyUnicode_EncodeUTF7(PyObject *str,
4490 int base64SetO,
4491 int base64WhiteSpace,
4492 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004494 int kind;
4495 void *data;
4496 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004497 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004498 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 unsigned int base64bits = 0;
4502 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503 char * out;
4504 char * start;
4505
Benjamin Petersonbac79492012-01-14 13:34:47 -05004506 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004507 return NULL;
4508 kind = PyUnicode_KIND(str);
4509 data = PyUnicode_DATA(str);
4510 len = PyUnicode_GET_LENGTH(str);
4511
4512 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004515 /* It might be possible to tighten this worst case */
4516 allocated = 8 * len;
4517 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004518 return PyErr_NoMemory();
4519
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521 if (v == NULL)
4522 return NULL;
4523
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004524 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004525 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004526 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004527
Antoine Pitrou244651a2009-05-04 18:56:13 +00004528 if (inShift) {
4529 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4530 /* shifting out */
4531 if (base64bits) { /* output remaining bits */
4532 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4533 base64buffer = 0;
4534 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
4536 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004537 /* Characters not in the BASE64 set implicitly unshift the sequence
4538 so no '-' is required, except if the character is itself a '-' */
4539 if (IS_BASE64(ch) || ch == '-') {
4540 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 *out++ = (char) ch;
4543 }
4544 else {
4545 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004546 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004548 else { /* not in a shift sequence */
4549 if (ch == '+') {
4550 *out++ = '+';
4551 *out++ = '-';
4552 }
4553 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4554 *out++ = (char) ch;
4555 }
4556 else {
4557 *out++ = '+';
4558 inShift = 1;
4559 goto encode_char;
4560 }
4561 }
4562 continue;
4563encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004565 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004566
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 /* code first surrogate */
4568 base64bits += 16;
4569 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4570 while (base64bits >= 6) {
4571 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4572 base64bits -= 6;
4573 }
4574 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004575 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 base64bits += 16;
4578 base64buffer = (base64buffer << 16) | ch;
4579 while (base64bits >= 6) {
4580 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4581 base64bits -= 6;
4582 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004583 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 if (base64bits)
4585 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4586 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004588 if (_PyBytes_Resize(&v, out - start) < 0)
4589 return NULL;
4590 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004592PyObject *
4593PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4594 Py_ssize_t size,
4595 int base64SetO,
4596 int base64WhiteSpace,
4597 const char *errors)
4598{
4599 PyObject *result;
4600 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4601 if (tmp == NULL)
4602 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004603 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004604 base64WhiteSpace, errors);
4605 Py_DECREF(tmp);
4606 return result;
4607}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609#undef IS_BASE64
4610#undef FROM_BASE64
4611#undef TO_BASE64
4612#undef DECODE_DIRECT
4613#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615/* --- UTF-8 Codec -------------------------------------------------------- */
4616
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is not a legal prefix:
   continuation bytes (80-BF), the overlong lead bytes C0-C1, and F5-FF.
   See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 + C2-CF */
                0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 + F5-FF */
                4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
4638
Alexander Belopolsky40018472011-02-26 01:02:56 +00004639PyObject *
4640PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004641 Py_ssize_t size,
4642 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643{
Walter Dörwald69652032004-09-07 20:24:22 +00004644 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4645}
4646
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004647#include "stringlib/ucs1lib.h"
4648#include "stringlib/codecs.h"
4649#include "stringlib/undef.h"
4650
4651#include "stringlib/ucs2lib.h"
4652#include "stringlib/codecs.h"
4653#include "stringlib/undef.h"
4654
4655#include "stringlib/ucs4lib.h"
4656#include "stringlib/codecs.h"
4657#include "stringlib/undef.h"
4658
Antoine Pitrouab868312009-01-10 15:40:25 +00004659/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4660#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4661
4662/* Mask to quickly check whether a C 'long' contains a
4663 non-ASCII, UTF8-encoded char. */
4664#if (SIZEOF_LONG == 8)
4665# define ASCII_CHAR_MASK 0x8080808080808080L
4666#elif (SIZEOF_LONG == 4)
4667# define ASCII_CHAR_MASK 0x80808080L
4668#else
4669# error C 'long' size should be either 4 or 8!
4670#endif
4671
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004672/* Scans a UTF-8 string and returns the maximum character to be expected
4673 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004675 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677 */
4678static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004679utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682 const unsigned char *end = p + string_size;
4683 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004684
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004685 assert(unicode_size != NULL);
4686
4687 /* By having a cascade of independent loops which fallback onto each
4688 other, we minimize the amount of work done in the average loop
4689 iteration, and we also maximize the CPU's ability to predict
4690 branches correctly (because a given condition will have always the
4691 same boolean outcome except perhaps in the last iteration of the
4692 corresponding loop).
4693 In the general case this brings us rather close to decoding
4694 performance pre-PEP 393, despite the two-pass decoding.
4695
4696 Note that the pure ASCII loop is not duplicated once a non-ASCII
4697 character has been encountered. It is actually a pessimization (by
4698 a significant factor) to use this loop on text with many non-ASCII
4699 characters, and it is important to avoid bad performance on valid
4700 utf-8 data (invalid utf-8 being a different can of worms).
4701 */
4702
4703 /* ASCII */
4704 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705 /* Only check value if it's not a ASCII char... */
4706 if (*p < 0x80) {
4707 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4708 an explanation. */
4709 if (!((size_t) p & LONG_PTR_MASK)) {
4710 /* Help register allocation */
4711 register const unsigned char *_p = p;
4712 while (_p < aligned_end) {
4713 unsigned long value = *(unsigned long *) _p;
4714 if (value & ASCII_CHAR_MASK)
4715 break;
4716 _p += SIZEOF_LONG;
4717 char_count += SIZEOF_LONG;
4718 }
4719 p = _p;
4720 if (p == end)
4721 break;
4722 }
4723 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004724 if (*p < 0x80)
4725 ++char_count;
4726 else
4727 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004729 *unicode_size = char_count;
4730 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004732_ucs1loop:
4733 for (; p < end; ++p) {
4734 if (*p < 0xc4)
4735 char_count += ((*p & 0xc0) != 0x80);
4736 else
4737 goto _ucs2loop;
4738 }
4739 *unicode_size = char_count;
4740 return 255;
4741
4742_ucs2loop:
4743 for (; p < end; ++p) {
4744 if (*p < 0xf0)
4745 char_count += ((*p & 0xc0) != 0x80);
4746 else
4747 goto _ucs4loop;
4748 }
4749 *unicode_size = char_count;
4750 return 65535;
4751
4752_ucs4loop:
4753 for (; p < end; ++p) {
4754 char_count += ((*p & 0xc0) != 0x80);
4755 }
4756 *unicode_size = char_count;
4757 return 65537;
4758}
4759
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004760/* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
Victor Stinner785938e2011-12-11 20:09:03 +01004761 in case of errors. Implicit parameters: unicode, kind, data, onError.
4762 Potential resizing overallocates, so the result needs to shrink at the end.
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004763*/
Victor Stinner785938e2011-12-11 20:09:03 +01004764#define WRITE_MAYBE_FAIL(index, value) \
4765 do { \
4766 Py_ssize_t pos = index; \
4767 if (pos > PyUnicode_GET_LENGTH(unicode) && \
4768 unicode_resize(&unicode, pos + pos/8) < 0) \
4769 goto onError; \
4770 if (unicode_putchar(&unicode, &pos, value) < 0) \
4771 goto onError; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772 } while (0)
4773
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004774static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004775decode_utf8_errors(const char *starts,
4776 Py_ssize_t size,
4777 const char *errors,
4778 Py_ssize_t *consumed,
4779 const char *s,
4780 PyObject *unicode,
4781 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004782{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004784 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004785 Py_ssize_t startinpos;
4786 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004787 const char *e = starts + size;
4788 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004789 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004790 PyObject *errorHandler = NULL;
4791 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004792
Antoine Pitrouab868312009-01-10 15:40:25 +00004793 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
4795 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004796 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797
4798 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004799 /* Fast path for runs of ASCII characters. Given that common UTF-8
4800 input will consist of an overwhelming majority of ASCII
4801 characters, we try to optimize for this case by checking
4802 as many characters as a C 'long' can contain.
4803 First, check if we can do an aligned read, as most CPUs have
4804 a penalty for unaligned reads.
4805 */
4806 if (!((size_t) s & LONG_PTR_MASK)) {
4807 /* Help register allocation */
4808 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004810 while (_s < aligned_end) {
4811 /* Read a whole long at a time (either 4 or 8 bytes),
4812 and do a fast unrolled copy if it only contains ASCII
4813 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814 unsigned long value = *(unsigned long *) _s;
4815 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004816 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004817 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4818 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4819 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4820 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004821#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004822 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4823 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4824 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4825 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004826#endif
4827 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004829 }
4830 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004832 if (s == e)
4833 break;
4834 ch = (unsigned char)*s;
4835 }
4836 }
4837
4838 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004839 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 s++;
4841 continue;
4842 }
4843
4844 n = utf8_code_length[ch];
4845
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004846 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004847 if (consumed)
4848 break;
4849 else {
4850 errmsg = "unexpected end of data";
4851 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004852 endinpos = startinpos+1;
4853 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4854 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 goto utf8Error;
4856 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858
4859 switch (n) {
4860
4861 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004862 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004863 startinpos = s-starts;
4864 endinpos = startinpos+1;
4865 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
4867 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004868 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004869 startinpos = s-starts;
4870 endinpos = startinpos+1;
4871 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
4873 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004874 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004875 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004876 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004877 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004878 goto utf8Error;
4879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004881 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004882 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 break;
4884
4885 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004886 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4887 will result in surrogates in range d800-dfff. Surrogates are
4888 not valid UTF-8 so they are rejected.
4889 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4890 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004891 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004892 (s[2] & 0xc0) != 0x80 ||
4893 ((unsigned char)s[0] == 0xE0 &&
4894 (unsigned char)s[1] < 0xA0) ||
4895 ((unsigned char)s[0] == 0xED &&
4896 (unsigned char)s[1] > 0x9F)) {
4897 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004898 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004899 endinpos = startinpos + 1;
4900
4901 /* if s[1] first two bits are 1 and 0, then the invalid
4902 continuation byte is s[2], so increment endinpos by 1,
4903 if not, s[1] is invalid and endinpos doesn't need to
4904 be incremented. */
4905 if ((s[1] & 0xC0) == 0x80)
4906 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 goto utf8Error;
4908 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004909 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004910 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004911 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004912 break;
4913
4914 case 4:
4915 if ((s[1] & 0xc0) != 0x80 ||
4916 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004917 (s[3] & 0xc0) != 0x80 ||
4918 ((unsigned char)s[0] == 0xF0 &&
4919 (unsigned char)s[1] < 0x90) ||
4920 ((unsigned char)s[0] == 0xF4 &&
4921 (unsigned char)s[1] > 0x8F)) {
4922 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004924 endinpos = startinpos + 1;
4925 if ((s[1] & 0xC0) == 0x80) {
4926 endinpos++;
4927 if ((s[2] & 0xC0) == 0x80)
4928 endinpos++;
4929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004930 goto utf8Error;
4931 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004932 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004933 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004934 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004935
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004936 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938 }
4939 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004941
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 if (unicode_decode_call_errorhandler(
4944 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004945 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004947 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004949 /* Update data because unicode_decode_call_errorhandler might have
4950 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 }
Walter Dörwald69652032004-09-07 20:24:22 +00004953 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004956 /* Adjust length and ready string when it contained errors and
4957 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004958 if (unicode_resize(&unicode, i) < 0)
4959 goto onError;
4960 unicode_adjust_maxchar(&unicode);
4961 if (unicode == NULL)
4962 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004964 Py_XDECREF(errorHandler);
4965 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004966 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004967 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004970 Py_XDECREF(errorHandler);
4971 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004972 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973 return NULL;
4974}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004975#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004976
Victor Stinner785938e2011-12-11 20:09:03 +01004977PyObject *
4978PyUnicode_DecodeUTF8Stateful(const char *s,
4979 Py_ssize_t size,
4980 const char *errors,
4981 Py_ssize_t *consumed)
4982{
4983 Py_UCS4 maxchar = 0;
4984 Py_ssize_t unicode_size;
4985 int has_errors = 0;
4986 PyObject *unicode;
4987 int kind;
4988 void *data;
4989 const char *starts = s;
4990 const char *e;
4991 Py_ssize_t i;
4992
4993 if (size == 0) {
4994 if (consumed)
4995 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004996 Py_INCREF(unicode_empty);
4997 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004998 }
4999
Victor Stinnera1d12bb2011-12-11 21:53:09 +01005000 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01005001
5002 /* When the string is ASCII only, just use memcpy and return.
5003 unicode_size may be != size if there is an incomplete UTF-8
5004 sequence at the end of the ASCII block. */
5005 if (maxchar < 128 && size == unicode_size) {
5006 if (consumed)
5007 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01005008 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01005009 }
5010
5011 unicode = PyUnicode_New(unicode_size, maxchar);
5012 if (!unicode)
5013 return NULL;
5014 kind = PyUnicode_KIND(unicode);
5015 data = PyUnicode_DATA(unicode);
5016
5017 /* Unpack UTF-8 encoded data */
5018 i = 0;
5019 e = starts + size;
5020 switch (kind) {
5021 case PyUnicode_1BYTE_KIND:
5022 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
5023 break;
5024 case PyUnicode_2BYTE_KIND:
5025 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
5026 break;
5027 case PyUnicode_4BYTE_KIND:
5028 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
5029 break;
5030 }
5031 if (!has_errors) {
5032 /* Ensure the unicode size calculation was correct */
5033 assert(i == unicode_size);
5034 assert(s == e);
5035 if (consumed)
5036 *consumed = size;
5037 return unicode;
5038 }
5039
5040 /* In case of errors, maxchar and size computation might be incorrect;
5041 code below refits and resizes as necessary. */
5042 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5043}
5044
#ifdef __APPLE__

/* Simplified UTF-8 decoder using the surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a NUL-terminated wchar_t array
   obtained from PyMem_Malloc() (presumably to be released by the caller
   with PyMem_Free() -- confirm against callers).  Every byte that does not
   start a valid UTF-8 sequence is emitted as 0xDC00 + byte.  Code points
   above 0xFFFF become a surrogate pair when wchar_t is not 4 bytes wide.

   Returns NULL with MemoryError set if `size` is negative or too large,
   and NULL without an exception if the allocation itself fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    int n;
    const char *e;
    wchar_t *unicode, *p;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 elements (payload + terminator) are
       enough.  The bound is tested on `size` itself so that `size + 1`
       is never evaluated before it is known not to overflow; negative
       sizes are rejected as well. */
    if (size < 0 || (size_t)size >= PY_SSIZE_T_MAX / sizeof(wchar_t)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        if (ch < 0x80) {
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }

        n = utf8_code_length[ch];
        /* Compare lengths rather than forming s + n, which could point
           beyond one-past-the-end of the input. */
        if (n > e - s) {
            goto surrogateescape;
        }

        switch (n) {
        case 0:
        case 1:
            goto surrogateescape;

        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {

                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (wchar_t)ch;
            break;

        case 4:
            /* Reject overlong forms (F0 80..8F) and values above the
               Unicode range (F4 90..BF). */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
#else
            /* compute and append the two surrogates: */
            *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += n;
        continue;

      surrogateescape:
        /* ch still holds the undecodable lead byte here: it is only
           overwritten after a sequence has been fully validated. */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005153/* Primary internal function which creates utf8 encoded bytes objects.
5154
5155 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005156 and allocate exactly as much space needed at the end. Else allocate the
5157 maximum possible needed (4 result bytes per Unicode character), and return
5158 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005159*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005160PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005161_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162{
Victor Stinner6099a032011-12-18 14:22:26 +01005163 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 void *data;
5165 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005167 if (!PyUnicode_Check(unicode)) {
5168 PyErr_BadArgument();
5169 return NULL;
5170 }
5171
5172 if (PyUnicode_READY(unicode) == -1)
5173 return NULL;
5174
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005175 if (PyUnicode_UTF8(unicode))
5176 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5177 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005178
5179 kind = PyUnicode_KIND(unicode);
5180 data = PyUnicode_DATA(unicode);
5181 size = PyUnicode_GET_LENGTH(unicode);
5182
Benjamin Petersonead6b532011-12-20 17:23:42 -06005183 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005184 default:
5185 assert(0);
5186 case PyUnicode_1BYTE_KIND:
5187 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5188 assert(!PyUnicode_IS_ASCII(unicode));
5189 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5190 case PyUnicode_2BYTE_KIND:
5191 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5192 case PyUnicode_4BYTE_KIND:
5193 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195}
5196
Alexander Belopolsky40018472011-02-26 01:02:56 +00005197PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005198PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5199 Py_ssize_t size,
5200 const char *errors)
5201{
5202 PyObject *v, *unicode;
5203
5204 unicode = PyUnicode_FromUnicode(s, size);
5205 if (unicode == NULL)
5206 return NULL;
5207 v = _PyUnicode_AsUTF8String(unicode, errors);
5208 Py_DECREF(unicode);
5209 return v;
5210}
5211
5212PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005213PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005215 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216}
5217
Walter Dörwald41980ca2007-08-16 21:55:45 +00005218/* --- UTF-32 Codec ------------------------------------------------------- */
5219
5220PyObject *
5221PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005222 Py_ssize_t size,
5223 const char *errors,
5224 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005225{
5226 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5227}
5228
5229PyObject *
5230PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005231 Py_ssize_t size,
5232 const char *errors,
5233 int *byteorder,
5234 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005235{
5236 const char *starts = s;
5237 Py_ssize_t startinpos;
5238 Py_ssize_t endinpos;
5239 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005240 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005241 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005242 int bo = 0; /* assume native ordering by default */
5243 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005244 /* Offsets from q for retrieving bytes in the right order. */
5245#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5246 int iorder[] = {0, 1, 2, 3};
5247#else
5248 int iorder[] = {3, 2, 1, 0};
5249#endif
5250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005252
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253 q = (unsigned char *)s;
5254 e = q + size;
5255
5256 if (byteorder)
5257 bo = *byteorder;
5258
5259 /* Check for BOM marks (U+FEFF) in the input and adjust current
5260 byte order setting accordingly. In native mode, the leading BOM
5261 mark is skipped, in all other modes, it is copied to the output
5262 stream as-is (giving a ZWNBSP character). */
5263 if (bo == 0) {
5264 if (size >= 4) {
5265 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005266 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005267#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005268 if (bom == 0x0000FEFF) {
5269 q += 4;
5270 bo = -1;
5271 }
5272 else if (bom == 0xFFFE0000) {
5273 q += 4;
5274 bo = 1;
5275 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005276#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005277 if (bom == 0x0000FEFF) {
5278 q += 4;
5279 bo = 1;
5280 }
5281 else if (bom == 0xFFFE0000) {
5282 q += 4;
5283 bo = -1;
5284 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005285#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005286 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005287 }
5288
5289 if (bo == -1) {
5290 /* force LE */
5291 iorder[0] = 0;
5292 iorder[1] = 1;
5293 iorder[2] = 2;
5294 iorder[3] = 3;
5295 }
5296 else if (bo == 1) {
5297 /* force BE */
5298 iorder[0] = 3;
5299 iorder[1] = 2;
5300 iorder[2] = 1;
5301 iorder[3] = 0;
5302 }
5303
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005304 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005305 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005306 if (!unicode)
5307 return NULL;
5308 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005309 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005310 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005311
Walter Dörwald41980ca2007-08-16 21:55:45 +00005312 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005313 Py_UCS4 ch;
5314 /* remaining bytes at the end? (size should be divisible by 4) */
5315 if (e-q<4) {
5316 if (consumed)
5317 break;
5318 errmsg = "truncated data";
5319 startinpos = ((const char *)q)-starts;
5320 endinpos = ((const char *)e)-starts;
5321 goto utf32Error;
5322 /* The remaining input chars are ignored if the callback
5323 chooses to skip the input */
5324 }
5325 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5326 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005327
Benjamin Peterson29060642009-01-31 22:14:21 +00005328 if (ch >= 0x110000)
5329 {
5330 errmsg = "codepoint not in range(0x110000)";
5331 startinpos = ((const char *)q)-starts;
5332 endinpos = startinpos+4;
5333 goto utf32Error;
5334 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005335 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5336 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 q += 4;
5338 continue;
5339 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005340 if (unicode_decode_call_errorhandler(
5341 errors, &errorHandler,
5342 "utf32", errmsg,
5343 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005344 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005346 }
5347
5348 if (byteorder)
5349 *byteorder = bo;
5350
5351 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353
5354 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005355 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005356 goto onError;
5357
5358 Py_XDECREF(errorHandler);
5359 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005360 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005361
Benjamin Peterson29060642009-01-31 22:14:21 +00005362 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005363 Py_DECREF(unicode);
5364 Py_XDECREF(errorHandler);
5365 Py_XDECREF(exc);
5366 return NULL;
5367}
5368
5369PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005370_PyUnicode_EncodeUTF32(PyObject *str,
5371 const char *errors,
5372 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005373{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005374 int kind;
5375 void *data;
5376 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005377 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005378 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380 /* Offsets from p for storing byte pairs in the right order. */
5381#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5382 int iorder[] = {0, 1, 2, 3};
5383#else
5384 int iorder[] = {3, 2, 1, 0};
5385#endif
5386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387#define STORECHAR(CH) \
5388 do { \
5389 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5390 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5391 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5392 p[iorder[0]] = (CH) & 0xff; \
5393 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005394 } while(0)
5395
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 if (!PyUnicode_Check(str)) {
5397 PyErr_BadArgument();
5398 return NULL;
5399 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005400 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005401 return NULL;
5402 kind = PyUnicode_KIND(str);
5403 data = PyUnicode_DATA(str);
5404 len = PyUnicode_GET_LENGTH(str);
5405
5406 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005407 bytesize = nsize * 4;
5408 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005410 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005411 if (v == NULL)
5412 return NULL;
5413
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005414 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005415 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005416 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005418 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005419
5420 if (byteorder == -1) {
5421 /* force LE */
5422 iorder[0] = 0;
5423 iorder[1] = 1;
5424 iorder[2] = 2;
5425 iorder[3] = 3;
5426 }
5427 else if (byteorder == 1) {
5428 /* force BE */
5429 iorder[0] = 3;
5430 iorder[1] = 2;
5431 iorder[2] = 1;
5432 iorder[3] = 0;
5433 }
5434
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005435 for (i = 0; i < len; i++)
5436 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005437
5438 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005439 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005440#undef STORECHAR
5441}
5442
Alexander Belopolsky40018472011-02-26 01:02:56 +00005443PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5445 Py_ssize_t size,
5446 const char *errors,
5447 int byteorder)
5448{
5449 PyObject *result;
5450 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5451 if (tmp == NULL)
5452 return NULL;
5453 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5454 Py_DECREF(tmp);
5455 return result;
5456}
5457
5458PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005459PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005460{
Victor Stinnerb960b342011-11-20 19:12:52 +01005461 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005462}
5463
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464/* --- UTF-16 Codec ------------------------------------------------------- */
5465
Tim Peters772747b2001-08-09 22:21:55 +00005466PyObject *
5467PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t size,
5469 const char *errors,
5470 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Walter Dörwald69652032004-09-07 20:24:22 +00005472 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5473}
5474
/* Masks used by the UTF-16 decoder when it processes a whole C 'long'
   of input at once.

   FAST_CHAR_MASK and SWAPPED_FAST_CHAR_MASK allow fast checking of
   whether a C 'long' may contain UTF16-encoded surrogate characters: they
   select the top bit of every 16-bit unit.  This is an efficient
   heuristic, assuming that non-surrogate characters with a code point
   >= 0x8000 are rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   STRIPPED_MASK selects the low byte of every 16-bit unit; the decoder
   uses it to swap the two bytes of each unit in byteswapped input.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK 0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
# define STRIPPED_MASK 0x00FF00FF00FF00FFL
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK 0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
# define STRIPPED_MASK 0x00FF00FFL
#else
# error C 'long' size should be either 4 or 8!
#endif
5493
Walter Dörwald69652032004-09-07 20:24:22 +00005494PyObject *
5495PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 Py_ssize_t size,
5497 const char *errors,
5498 int *byteorder,
5499 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005500{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005501 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005502 Py_ssize_t startinpos;
5503 Py_ssize_t endinpos;
5504 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005505 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005506 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005507 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005508 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005509 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005510 /* Offsets from q for retrieving byte pairs in the right order. */
5511#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5512 int ihi = 1, ilo = 0;
5513#else
5514 int ihi = 0, ilo = 1;
5515#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005516 PyObject *errorHandler = NULL;
5517 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518
5519 /* Note: size will always be longer than the resulting Unicode
5520 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005521 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 if (!unicode)
5523 return NULL;
5524 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005525 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005526 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
Tim Peters772747b2001-08-09 22:21:55 +00005528 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005529 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
5531 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005532 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005534 /* Check for BOM marks (U+FEFF) in the input and adjust current
5535 byte order setting accordingly. In native mode, the leading BOM
5536 mark is skipped, in all other modes, it is copied to the output
5537 stream as-is (giving a ZWNBSP character). */
5538 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005539 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005540 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005541#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005542 if (bom == 0xFEFF) {
5543 q += 2;
5544 bo = -1;
5545 }
5546 else if (bom == 0xFFFE) {
5547 q += 2;
5548 bo = 1;
5549 }
Tim Petersced69f82003-09-16 20:30:58 +00005550#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 if (bom == 0xFEFF) {
5552 q += 2;
5553 bo = 1;
5554 }
5555 else if (bom == 0xFFFE) {
5556 q += 2;
5557 bo = -1;
5558 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005559#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005560 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562
Tim Peters772747b2001-08-09 22:21:55 +00005563 if (bo == -1) {
5564 /* force LE */
5565 ihi = 1;
5566 ilo = 0;
5567 }
5568 else if (bo == 1) {
5569 /* force BE */
5570 ihi = 0;
5571 ilo = 1;
5572 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005573#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5574 native_ordering = ilo < ihi;
5575#else
5576 native_ordering = ilo > ihi;
5577#endif
Tim Peters772747b2001-08-09 22:21:55 +00005578
Antoine Pitrouab868312009-01-10 15:40:25 +00005579 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005580 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005581 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005582 /* First check for possible aligned read of a C 'long'. Unaligned
5583 reads are more expensive, better to defer to another iteration. */
5584 if (!((size_t) q & LONG_PTR_MASK)) {
5585 /* Fast path for runs of non-surrogate chars. */
5586 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 int kind = PyUnicode_KIND(unicode);
5588 void *data = PyUnicode_DATA(unicode);
5589 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005590 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005591 Py_UCS4 maxch;
5592 if (native_ordering) {
5593 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005594 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005595 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005596 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005597 else {
5598 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005599 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005600 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005601 block = ((block >> 8) & STRIPPED_MASK) |
5602 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005603 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005604 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005605#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005606 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005607 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005608 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005609 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005610 ch = (Py_UCS2)(block >> 48);
Victor Stinnere6abb482012-05-02 01:15:40 +02005611 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005612#else
5613 ch = (Py_UCS2)(block >> 16);
Victor Stinnere6abb482012-05-02 01:15:40 +02005614 maxch = MAX_MAXCHAR(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005615#endif
5616 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
Victor Stinner1b487b42012-05-03 12:29:04 +02005617 if (unicode_widen(&unicode, outpos, maxch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005618 goto onError;
5619 kind = PyUnicode_KIND(unicode);
5620 data = PyUnicode_DATA(unicode);
5621 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005622#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5623 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005624#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005625 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5626 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5627 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5628#else
5629 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5630#endif
5631#else
5632#if SIZEOF_LONG == 8
5633 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5634 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5635 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5636#else
5637 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5638#endif
5639 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005640#endif
5641 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005642 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005643 q = _q;
5644 if (q >= e)
5645 break;
5646 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005647 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648
Benjamin Peterson14339b62009-01-31 16:36:08 +00005649 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005650
Victor Stinner551ac952011-11-29 22:58:13 +01005651 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005652 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5653 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005654 continue;
5655 }
5656
5657 /* UTF-16 code pair: */
5658 if (q > e) {
5659 errmsg = "unexpected end of data";
5660 startinpos = (((const char *)q) - 2) - starts;
5661 endinpos = ((const char *)e) + 1 - starts;
5662 goto utf16Error;
5663 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005664 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5665 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005667 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005668 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005669 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005670 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005671 continue;
5672 }
5673 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005674 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005675 startinpos = (((const char *)q)-4)-starts;
5676 endinpos = startinpos+2;
5677 goto utf16Error;
5678 }
5679
Benjamin Peterson14339b62009-01-31 16:36:08 +00005680 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005681 errmsg = "illegal encoding";
5682 startinpos = (((const char *)q)-2)-starts;
5683 endinpos = startinpos+2;
5684 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005685
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005687 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005688 errors,
5689 &errorHandler,
5690 "utf16", errmsg,
5691 &starts,
5692 (const char **)&e,
5693 &startinpos,
5694 &endinpos,
5695 &exc,
5696 (const char **)&q,
5697 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005699 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005701 /* remaining byte at the end? (size should be even) */
5702 if (e == q) {
5703 if (!consumed) {
5704 errmsg = "truncated data";
5705 startinpos = ((const char *)q) - starts;
5706 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005707 if (unicode_decode_call_errorhandler(
5708 errors,
5709 &errorHandler,
5710 "utf16", errmsg,
5711 &starts,
5712 (const char **)&e,
5713 &startinpos,
5714 &endinpos,
5715 &exc,
5716 (const char **)&q,
5717 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005718 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005719 goto onError;
5720 /* The remaining input chars are ignored if the callback
5721 chooses to skip the input */
5722 }
5723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
5725 if (byteorder)
5726 *byteorder = bo;
5727
Walter Dörwald69652032004-09-07 20:24:22 +00005728 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005730
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005732 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733 goto onError;
5734
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005737 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
5744}
5745
Antoine Pitrouab868312009-01-10 15:40:25 +00005746#undef FAST_CHAR_MASK
5747#undef SWAPPED_FAST_CHAR_MASK
5748
Tim Peters772747b2001-08-09 22:21:55 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750_PyUnicode_EncodeUTF16(PyObject *str,
5751 const char *errors,
5752 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754 int kind;
5755 void *data;
5756 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005757 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005758 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005759 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005760 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005761 /* Offsets from p for storing byte pairs in the right order. */
5762#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5763 int ihi = 1, ilo = 0;
5764#else
5765 int ihi = 0, ilo = 1;
5766#endif
5767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768#define STORECHAR(CH) \
5769 do { \
5770 p[ihi] = ((CH) >> 8) & 0xff; \
5771 p[ilo] = (CH) & 0xff; \
5772 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005773 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005775 if (!PyUnicode_Check(str)) {
5776 PyErr_BadArgument();
5777 return NULL;
5778 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005779 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005780 return NULL;
5781 kind = PyUnicode_KIND(str);
5782 data = PyUnicode_DATA(str);
5783 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005784
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785 pairs = 0;
5786 if (kind == PyUnicode_4BYTE_KIND)
5787 for (i = 0; i < len; i++)
5788 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5789 pairs++;
5790 /* 2 * (len + pairs + (byteorder == 0)) */
5791 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005793 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005794 bytesize = nsize * 2;
5795 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005797 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798 if (v == NULL)
5799 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005801 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005804 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005805 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005806
5807 if (byteorder == -1) {
5808 /* force LE */
5809 ihi = 1;
5810 ilo = 0;
5811 }
5812 else if (byteorder == 1) {
5813 /* force BE */
5814 ihi = 0;
5815 ilo = 1;
5816 }
5817
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005818 for (i = 0; i < len; i++) {
5819 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5820 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005822 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5823 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 }
Tim Peters772747b2001-08-09 22:21:55 +00005825 STORECHAR(ch);
5826 if (ch2)
5827 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005828 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005829
5830 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005832#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833}
5834
Alexander Belopolsky40018472011-02-26 01:02:56 +00005835PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005836PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5837 Py_ssize_t size,
5838 const char *errors,
5839 int byteorder)
5840{
5841 PyObject *result;
5842 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5843 if (tmp == NULL)
5844 return NULL;
5845 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5846 Py_DECREF(tmp);
5847 return result;
5848}
5849
5850PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005851PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005853 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854}
5855
5856/* --- Unicode Escape Codec ----------------------------------------------- */
5857
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   decoding the escapes in the `size` bytes at `s` is guaranteed to yield
   a pure-ASCII string.

   Returns the number of characters the decoded string will have, or -1
   when that cannot be guaranteed:
     - size is negative,
     - a byte above 127 occurs,
     - the input ends right after a backslash,
     - an octal, \x, \u, \U or \N escape occurs (these may produce
       non-ASCII characters).
   A backslash followed by a newline contributes no character; an
   unrecognised escape contributes two (the backslash and the character
   after it). */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `s + size`: pointer arithmetic
       that leaves the buffer is undefined even if never dereferenced. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5912
Fredrik Lundh06d12682001-01-24 07:59:11 +00005913static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005914
Alexander Belopolsky40018472011-02-26 01:02:56 +00005915PyObject *
5916PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005917 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005918 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005920 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005921 Py_ssize_t startinpos;
5922 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005923 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005924 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005926 char* message;
5927 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 PyObject *errorHandler = NULL;
5929 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005931 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005932
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005933 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934
5935 /* After length_of_escaped_ascii_string() there are two alternatives,
5936 either the string is pure ASCII with named escapes like \n, etc.
5937 and we determined it's exact size (common case)
5938 or it contains \x, \u, ... escape sequences. then we create a
5939 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005940 if (len >= 0) {
5941 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942 if (!v)
5943 goto onError;
5944 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005945 }
5946 else {
5947 /* Escaped strings will always be longer than the resulting
5948 Unicode string, so we start with size here and then reduce the
5949 length after conversion to the true value.
5950 (but if the error callback returns a long replacement string
5951 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953 if (!v)
5954 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005955 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005956 }
5957
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005959 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005960 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005962
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963 while (s < end) {
5964 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005965 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005966 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 /* The only case in which i == ascii_length is a backslash
5969 followed by a newline. */
5970 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005971
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972 /* Non-escape characters are interpreted as Unicode ordinals */
5973 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005974 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5975 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976 continue;
5977 }
5978
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005979 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 /* \ - Escapes */
5981 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005982 c = *s++;
5983 if (s > end)
5984 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005985
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005986 /* The only case in which i == ascii_length is a backslash
5987 followed by a newline. */
5988 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005989
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005990 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005993#define WRITECHAR(ch) \
5994 do { \
5995 if (unicode_putchar(&v, &i, ch) < 0) \
5996 goto onError; \
5997 }while(0)
5998
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006000 case '\\': WRITECHAR('\\'); break;
6001 case '\'': WRITECHAR('\''); break;
6002 case '\"': WRITECHAR('\"'); break;
6003 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006004 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006005 case 'f': WRITECHAR('\014'); break;
6006 case 't': WRITECHAR('\t'); break;
6007 case 'n': WRITECHAR('\n'); break;
6008 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006009 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006010 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006011 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 case '0': case '1': case '2': case '3':
6016 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006017 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006018 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006019 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006020 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006021 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006023 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024 break;
6025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 /* hex escapes */
6027 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006029 digits = 2;
6030 message = "truncated \\xXX escape";
6031 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006035 digits = 4;
6036 message = "truncated \\uXXXX escape";
6037 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006040 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006041 digits = 8;
6042 message = "truncated \\UXXXXXXXX escape";
6043 hexescape:
6044 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006045 if (s+digits>end) {
6046 endinpos = size;
6047 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 errors, &errorHandler,
6049 "unicodeescape", "end of string in escape sequence",
6050 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006051 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052 goto onError;
6053 goto nextByte;
6054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006055 for (j = 0; j < digits; ++j) {
6056 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006057 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006058 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006059 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 errors, &errorHandler,
6061 "unicodeescape", message,
6062 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006063 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006064 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006065 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006066 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006067 }
6068 chr = (chr<<4) & ~0xF;
6069 if (c >= '0' && c <= '9')
6070 chr += c - '0';
6071 else if (c >= 'a' && c <= 'f')
6072 chr += 10 + c - 'a';
6073 else
6074 chr += 10 + c - 'A';
6075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006076 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006077 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006078 /* _decoding_error will have already written into the
6079 target buffer. */
6080 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006081 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006082 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006083 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006084 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006085 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006086 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006088 errors, &errorHandler,
6089 "unicodeescape", "illegal Unicode character",
6090 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006091 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006092 goto onError;
6093 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 break;
6095
Benjamin Peterson29060642009-01-31 22:14:21 +00006096 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006097 case 'N':
6098 message = "malformed \\N character escape";
6099 if (ucnhash_CAPI == NULL) {
6100 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006101 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6102 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006103 if (ucnhash_CAPI == NULL)
6104 goto ucnhashError;
6105 }
6106 if (*s == '{') {
6107 const char *start = s+1;
6108 /* look for the closing brace */
6109 while (*s != '}' && s < end)
6110 s++;
6111 if (s > start && s < end && *s == '}') {
6112 /* found a name. look it up in the unicode database */
6113 message = "unknown Unicode character name";
6114 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006115 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006116 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006117 goto store;
6118 }
6119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006122 errors, &errorHandler,
6123 "unicodeescape", message,
6124 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006125 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006126 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006127 break;
6128
6129 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006130 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 message = "\\ at end of string";
6132 s--;
6133 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 errors, &errorHandler,
6136 "unicodeescape", message,
6137 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006139 goto onError;
6140 }
6141 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006142 WRITECHAR('\\');
6143 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006144 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006145 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006150#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006151
Victor Stinner16e6a802011-12-12 13:24:15 +01006152 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006153 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006154 Py_XDECREF(errorHandler);
6155 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006156 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006157
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006159 PyErr_SetString(
6160 PyExc_UnicodeError,
6161 "\\N escapes not supported (can't load unicodedata module)"
6162 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006163 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164 Py_XDECREF(errorHandler);
6165 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006166 return NULL;
6167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006170 Py_XDECREF(errorHandler);
6171 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172 return NULL;
6173}
6174
6175/* Return a Unicode-Escape string version of the Unicode object.
6176
6177 If quotes is true, the string is enclosed in u"" or u'' quotes as
6178 appropriate.
6179
6180*/
6181
Alexander Belopolsky40018472011-02-26 01:02:56 +00006182PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006186 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006188 int kind;
6189 void *data;
6190 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
Thomas Wouters89f507f2006-12-13 04:49:30 +00006192 /* Initial allocation is based on the longest-possible unichr
6193 escape.
6194
6195 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6196 unichr, so in this case it's the longest unichr escape. In
6197 narrow (UTF-16) builds this is five chars per source unichr
6198 since there are two unichrs in the surrogate pair, so in narrow
6199 (UTF-16) builds it's not the longest unichr escape.
6200
6201 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6202 so in the narrow (UTF-16) build case it's the longest unichr
6203 escape.
6204 */
6205
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 if (!PyUnicode_Check(unicode)) {
6207 PyErr_BadArgument();
6208 return NULL;
6209 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006210 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 return NULL;
6212 len = PyUnicode_GET_LENGTH(unicode);
6213 kind = PyUnicode_KIND(unicode);
6214 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006215 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6217 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6218 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6219 }
6220
6221 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006222 return PyBytes_FromStringAndSize(NULL, 0);
6223
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006224 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006226
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006228 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006229 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 if (repr == NULL)
6232 return NULL;
6233
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006234 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006236 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006237 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006238
Walter Dörwald79e913e2007-05-12 11:08:06 +00006239 /* Escape backslashes */
6240 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 *p++ = '\\';
6242 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006243 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006244 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006245
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006246 /* Map 21-bit characters to '\U00xxxxxx' */
6247 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006248 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006249 *p++ = '\\';
6250 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006251 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6255 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6256 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6257 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6258 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006260 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006261
Guido van Rossumd57fd912000-03-10 22:53:23 +00006262 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006263 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264 *p++ = '\\';
6265 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006266 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6267 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6268 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6269 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006270 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006271
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006272 /* Map special whitespace to '\t', \n', '\r' */
6273 else if (ch == '\t') {
6274 *p++ = '\\';
6275 *p++ = 't';
6276 }
6277 else if (ch == '\n') {
6278 *p++ = '\\';
6279 *p++ = 'n';
6280 }
6281 else if (ch == '\r') {
6282 *p++ = '\\';
6283 *p++ = 'r';
6284 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006285
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006286 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006287 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006289 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006290 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6291 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006292 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006293
Guido van Rossumd57fd912000-03-10 22:53:23 +00006294 /* Copy everything else as-is */
6295 else
6296 *p++ = (char) ch;
6297 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006299 assert(p - PyBytes_AS_STRING(repr) > 0);
6300 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6301 return NULL;
6302 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303}
6304
Alexander Belopolsky40018472011-02-26 01:02:56 +00006305PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6307 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006309 PyObject *result;
6310 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6311 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006313 result = PyUnicode_AsUnicodeEscapeString(tmp);
6314 Py_DECREF(tmp);
6315 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316}
6317
6318/* --- Raw Unicode Escape Codec ------------------------------------------- */
6319
Alexander Belopolsky40018472011-02-26 01:02:56 +00006320PyObject *
6321PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006322 Py_ssize_t size,
6323 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006325 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006326 Py_ssize_t startinpos;
6327 Py_ssize_t endinpos;
6328 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006329 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006330 const char *end;
6331 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332 PyObject *errorHandler = NULL;
6333 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006334
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335 /* Escaped strings will always be longer than the resulting
6336 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337 length after conversion to the true value. (But decoding error
6338 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006339 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006343 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006344 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006345 end = s + size;
6346 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 unsigned char c;
6348 Py_UCS4 x;
6349 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006350 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 /* Non-escape characters are interpreted as Unicode ordinals */
6353 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006354 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6355 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006356 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006357 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 startinpos = s-starts;
6359
6360 /* \u-escapes are only interpreted iff the number of leading
6361 backslashes if odd */
6362 bs = s;
6363 for (;s < end;) {
6364 if (*s != '\\')
6365 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006366 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6367 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 }
6369 if (((s - bs) & 1) == 0 ||
6370 s >= end ||
6371 (*s != 'u' && *s != 'U')) {
6372 continue;
6373 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006374 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006375 count = *s=='u' ? 4 : 8;
6376 s++;
6377
6378 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 for (x = 0, i = 0; i < count; ++i, ++s) {
6380 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006381 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 endinpos = s-starts;
6383 if (unicode_decode_call_errorhandler(
6384 errors, &errorHandler,
6385 "rawunicodeescape", "truncated \\uXXXX",
6386 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006387 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006388 goto onError;
6389 goto nextByte;
6390 }
6391 x = (x<<4) & ~0xF;
6392 if (c >= '0' && c <= '9')
6393 x += c - '0';
6394 else if (c >= 'a' && c <= 'f')
6395 x += 10 + c - 'a';
6396 else
6397 x += 10 + c - 'A';
6398 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006399 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006400 if (unicode_putchar(&v, &outpos, x) < 0)
6401 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006402 } else {
6403 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006404 if (unicode_decode_call_errorhandler(
6405 errors, &errorHandler,
6406 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006407 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006408 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006410 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 nextByte:
6412 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006414 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006416 Py_XDECREF(errorHandler);
6417 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006418 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006419
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 Py_XDECREF(errorHandler);
6423 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 return NULL;
6425}
6426
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427
Alexander Belopolsky40018472011-02-26 01:02:56 +00006428PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006429PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006431 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432 char *p;
6433 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006434 Py_ssize_t expandsize, pos;
6435 int kind;
6436 void *data;
6437 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006439 if (!PyUnicode_Check(unicode)) {
6440 PyErr_BadArgument();
6441 return NULL;
6442 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006443 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444 return NULL;
6445 kind = PyUnicode_KIND(unicode);
6446 data = PyUnicode_DATA(unicode);
6447 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006448 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6449 bytes, and 1 byte characters 4. */
6450 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006451
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006452 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006454
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 if (repr == NULL)
6457 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006458 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006461 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006462 for (pos = 0; pos < len; pos++) {
6463 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 /* Map 32-bit characters to '\Uxxxxxxxx' */
6465 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006466 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006467 *p++ = '\\';
6468 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006469 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6473 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6476 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006477 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006479 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 *p++ = '\\';
6481 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006482 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6483 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6484 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6485 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 /* Copy everything else as-is */
6488 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 *p++ = (char) ch;
6490 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006491
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006492 assert(p > q);
6493 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006494 return NULL;
6495 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006496}
6497
Alexander Belopolsky40018472011-02-26 01:02:56 +00006498PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006499PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6500 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006502 PyObject *result;
6503 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6504 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006505 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006506 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6507 Py_DECREF(tmp);
6508 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509}
6510
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006511/* --- Unicode Internal Codec ------------------------------------------- */
6512
Alexander Belopolsky40018472011-02-26 01:02:56 +00006513PyObject *
6514_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006515 Py_ssize_t size,
6516 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006517{
6518 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006519 Py_ssize_t startinpos;
6520 Py_ssize_t endinpos;
6521 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006522 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006523 const char *end;
6524 const char *reason;
6525 PyObject *errorHandler = NULL;
6526 PyObject *exc = NULL;
6527
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006528 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006529 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006530 1))
6531 return NULL;
6532
Thomas Wouters89f507f2006-12-13 04:49:30 +00006533 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006534 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006535 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006537 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006538 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006539 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006540 end = s + size;
6541
6542 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006543 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006544 Py_UCS4 ch;
6545 /* We copy the raw representation one byte at a time because the
6546 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[0] = s[0];
6548 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ((char *) &uch)[2] = s[2];
6551 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006552#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006553 ch = uch;
6554
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006555 /* We have to sanity check the raw data, otherwise doom looms for
6556 some malformed UCS-4 data. */
6557 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006558#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006559 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006560#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006561 end-s < Py_UNICODE_SIZE
6562 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006564 startinpos = s - starts;
6565 if (end-s < Py_UNICODE_SIZE) {
6566 endinpos = end-starts;
6567 reason = "truncated input";
6568 }
6569 else {
6570 endinpos = s - starts + Py_UNICODE_SIZE;
6571 reason = "illegal code point (> 0x10FFFF)";
6572 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006573 if (unicode_decode_call_errorhandler(
6574 errors, &errorHandler,
6575 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006576 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006577 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006578 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006579 continue;
6580 }
6581
6582 s += Py_UNICODE_SIZE;
6583#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006584 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006585 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006586 Py_UNICODE uch2;
6587 ((char *) &uch2)[0] = s[0];
6588 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006589 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006590 {
Victor Stinner551ac952011-11-29 22:58:13 +01006591 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006592 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006593 }
6594 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006595#endif
6596
6597 if (unicode_putchar(&v, &outpos, ch) < 0)
6598 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006599 }
6600
Victor Stinner16e6a802011-12-12 13:24:15 +01006601 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006602 goto onError;
6603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006605 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006606
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006608 Py_XDECREF(v);
6609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
6611 return NULL;
6612}
6613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614/* --- Latin-1 Codec ------------------------------------------------------ */
6615
Alexander Belopolsky40018472011-02-26 01:02:56 +00006616PyObject *
6617PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006618 Py_ssize_t size,
6619 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006622 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006626static void
6627make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006628 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006629 PyObject *unicode,
6630 Py_ssize_t startpos, Py_ssize_t endpos,
6631 const char *reason)
6632{
6633 if (*exceptionObject == NULL) {
6634 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006635 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006636 encoding, unicode, startpos, endpos, reason);
6637 }
6638 else {
6639 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6640 goto onError;
6641 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6642 goto onError;
6643 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6644 goto onError;
6645 return;
6646 onError:
6647 Py_DECREF(*exceptionObject);
6648 *exceptionObject = NULL;
6649 }
6650}
6651
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006652/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006653static void
6654raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006655 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006656 PyObject *unicode,
6657 Py_ssize_t startpos, Py_ssize_t endpos,
6658 const char *reason)
6659{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006660 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006661 encoding, unicode, startpos, endpos, reason);
6662 if (*exceptionObject != NULL)
6663 PyCodec_StrictErrors(*exceptionObject);
6664}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006665
6666/* error handling callback helper:
6667 build arguments, call the callback and check the arguments,
6668 put the result into newpos and return the replacement string, which
6669 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670static PyObject *
6671unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006672 PyObject **errorHandler,
6673 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006675 Py_ssize_t startpos, Py_ssize_t endpos,
6676 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006678 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006679 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 PyObject *restuple;
6681 PyObject *resunicode;
6682
6683 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006684 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006687 }
6688
Benjamin Petersonbac79492012-01-14 13:34:47 -05006689 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006690 return NULL;
6691 len = PyUnicode_GET_LENGTH(unicode);
6692
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006693 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006694 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697
6698 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006699 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006700 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006702 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006703 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 Py_DECREF(restuple);
6705 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006706 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006707 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006708 &resunicode, newpos)) {
6709 Py_DECREF(restuple);
6710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006712 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6713 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6714 Py_DECREF(restuple);
6715 return NULL;
6716 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006718 *newpos = len + *newpos;
6719 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6721 Py_DECREF(restuple);
6722 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 Py_INCREF(resunicode);
6725 Py_DECREF(restuple);
6726 return resunicode;
6727}
6728
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006731 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006732 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 /* input state */
6735 Py_ssize_t pos=0, size;
6736 int kind;
6737 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006738 /* output object */
6739 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 /* pointer into the output */
6741 char *str;
6742 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006743 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006744 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6745 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 PyObject *errorHandler = NULL;
6747 PyObject *exc = NULL;
6748 /* the following variable is used for caching string comparisons
6749 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6750 int known_errorHandler = -1;
6751
Benjamin Petersonbac79492012-01-14 13:34:47 -05006752 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753 return NULL;
6754 size = PyUnicode_GET_LENGTH(unicode);
6755 kind = PyUnicode_KIND(unicode);
6756 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 /* allocate enough for a simple encoding without
6758 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006759 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006760 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006761 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006763 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006764 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006765 ressize = size;
6766
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006767 while (pos < size) {
6768 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006769
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* can we encode this? */
6771 if (c<limit) {
6772 /* no overflow check, because we know that the space is enough */
6773 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006774 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006775 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 Py_ssize_t requiredsize;
6778 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006779 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 Py_ssize_t collstart = pos;
6782 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006783 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006785 ++collend;
6786 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6787 if (known_errorHandler==-1) {
6788 if ((errors==NULL) || (!strcmp(errors, "strict")))
6789 known_errorHandler = 1;
6790 else if (!strcmp(errors, "replace"))
6791 known_errorHandler = 2;
6792 else if (!strcmp(errors, "ignore"))
6793 known_errorHandler = 3;
6794 else if (!strcmp(errors, "xmlcharrefreplace"))
6795 known_errorHandler = 4;
6796 else
6797 known_errorHandler = 0;
6798 }
6799 switch (known_errorHandler) {
6800 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006801 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006802 goto onError;
6803 case 2: /* replace */
6804 while (collstart++<collend)
6805 *str++ = '?'; /* fall through */
6806 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006807 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 break;
6809 case 4: /* xmlcharrefreplace */
6810 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 /* determine replacement size */
6812 for (i = collstart, repsize = 0; i < collend; ++i) {
6813 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6814 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006816 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006818 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006820 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006822 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006824 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006826 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006827 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006829 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006831 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 if (requiredsize > ressize) {
6833 if (requiredsize<2*ressize)
6834 requiredsize = 2*ressize;
6835 if (_PyBytes_Resize(&res, requiredsize))
6836 goto onError;
6837 str = PyBytes_AS_STRING(res) + respos;
6838 ressize = requiredsize;
6839 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006840 /* generate replacement */
6841 for (i = collstart; i < collend; ++i) {
6842 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006845 break;
6846 default:
6847 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848 encoding, reason, unicode, &exc,
6849 collstart, collend, &newpos);
6850 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006851 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006853 if (PyBytes_Check(repunicode)) {
6854 /* Directly copy bytes result to output. */
6855 repsize = PyBytes_Size(repunicode);
6856 if (repsize > 1) {
6857 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006858 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006859 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6860 Py_DECREF(repunicode);
6861 goto onError;
6862 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006863 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006864 ressize += repsize-1;
6865 }
6866 memcpy(str, PyBytes_AsString(repunicode), repsize);
6867 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006869 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006870 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006871 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 /* need more space? (at least enough for what we
6873 have+the replacement+the rest of the string, so
6874 we won't have to check space for encodable characters) */
6875 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006876 repsize = PyUnicode_GET_LENGTH(repunicode);
6877 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 if (requiredsize > ressize) {
6879 if (requiredsize<2*ressize)
6880 requiredsize = 2*ressize;
6881 if (_PyBytes_Resize(&res, requiredsize)) {
6882 Py_DECREF(repunicode);
6883 goto onError;
6884 }
6885 str = PyBytes_AS_STRING(res) + respos;
6886 ressize = requiredsize;
6887 }
6888 /* check if there is anything unencodable in the replacement
6889 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 for (i = 0; repsize-->0; ++i, ++str) {
6891 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006893 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 Py_DECREF(repunicode);
6896 goto onError;
6897 }
6898 *str = (char)c;
6899 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006900 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006901 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006902 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006903 }
6904 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006905 /* Resize if we allocated to much */
6906 size = str - PyBytes_AS_STRING(res);
6907 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006908 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006909 if (_PyBytes_Resize(&res, size) < 0)
6910 goto onError;
6911 }
6912
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006913 Py_XDECREF(errorHandler);
6914 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006915 return res;
6916
6917 onError:
6918 Py_XDECREF(res);
6919 Py_XDECREF(errorHandler);
6920 Py_XDECREF(exc);
6921 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006922}
6923
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006924/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925PyObject *
6926PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006927 Py_ssize_t size,
6928 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006929{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930 PyObject *result;
6931 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6932 if (unicode == NULL)
6933 return NULL;
6934 result = unicode_encode_ucs1(unicode, errors, 256);
6935 Py_DECREF(unicode);
6936 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937}
6938
Alexander Belopolsky40018472011-02-26 01:02:56 +00006939PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006940_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006941{
6942 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 PyErr_BadArgument();
6944 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006946 if (PyUnicode_READY(unicode) == -1)
6947 return NULL;
6948 /* Fast path: if it is a one-byte string, construct
6949 bytes object directly. */
6950 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6951 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6952 PyUnicode_GET_LENGTH(unicode));
6953 /* Non-Latin-1 characters present. Defer to above function to
6954 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006955 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006956}
6957
6958PyObject*
6959PyUnicode_AsLatin1String(PyObject *unicode)
6960{
6961 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
6964/* --- 7-bit ASCII Codec -------------------------------------------------- */
6965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
6967PyUnicode_DecodeASCII(const char *s,
6968 Py_ssize_t size,
6969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006971 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006972 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006973 int kind;
6974 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006975 Py_ssize_t startinpos;
6976 Py_ssize_t endinpos;
6977 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006978 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006979 int has_error;
6980 const unsigned char *p = (const unsigned char *)s;
6981 const unsigned char *end = p + size;
6982 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006983 PyObject *errorHandler = NULL;
6984 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006985
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006986 if (size == 0) {
6987 Py_INCREF(unicode_empty);
6988 return unicode_empty;
6989 }
6990
Guido van Rossumd57fd912000-03-10 22:53:23 +00006991 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006992 if (size == 1 && (unsigned char)s[0] < 128)
6993 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006994
Victor Stinner702c7342011-10-05 13:50:52 +02006995 has_error = 0;
6996 while (p < end && !has_error) {
6997 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6998 an explanation. */
6999 if (!((size_t) p & LONG_PTR_MASK)) {
7000 /* Help register allocation */
7001 register const unsigned char *_p = p;
7002 while (_p < aligned_end) {
7003 unsigned long value = *(unsigned long *) _p;
7004 if (value & ASCII_CHAR_MASK) {
7005 has_error = 1;
7006 break;
7007 }
7008 _p += SIZEOF_LONG;
7009 }
7010 if (_p == end)
7011 break;
7012 if (has_error)
7013 break;
7014 p = _p;
7015 }
7016 if (*p & 0x80) {
7017 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007018 break;
Victor Stinner702c7342011-10-05 13:50:52 +02007019 }
7020 else {
7021 ++p;
7022 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007023 }
Victor Stinner702c7342011-10-05 13:50:52 +02007024 if (!has_error)
7025 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00007026
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007027 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007030 kind = PyUnicode_KIND(v);
7031 data = PyUnicode_DATA(v);
7032 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007033 e = s + size;
7034 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 register unsigned char c = (unsigned char)*s;
7036 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007037 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 ++s;
7039 }
7040 else {
7041 startinpos = s-starts;
7042 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 if (unicode_decode_call_errorhandler(
7044 errors, &errorHandler,
7045 "ascii", "ordinal not in range(128)",
7046 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007047 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007049 kind = PyUnicode_KIND(v);
7050 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007053 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007054 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055 Py_XDECREF(errorHandler);
7056 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007057 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007058 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007059
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007062 Py_XDECREF(errorHandler);
7063 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064 return NULL;
7065}
7066
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007067/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007068PyObject *
7069PyUnicode_EncodeASCII(const Py_UNICODE *p,
7070 Py_ssize_t size,
7071 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007073 PyObject *result;
7074 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7075 if (unicode == NULL)
7076 return NULL;
7077 result = unicode_encode_ucs1(unicode, errors, 128);
7078 Py_DECREF(unicode);
7079 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080}
7081
Alexander Belopolsky40018472011-02-26 01:02:56 +00007082PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007083_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084{
7085 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007086 PyErr_BadArgument();
7087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007089 if (PyUnicode_READY(unicode) == -1)
7090 return NULL;
7091 /* Fast path: if it is an ASCII-only string, construct bytes object
7092 directly. Else defer to above function to raise the exception. */
7093 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7094 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7095 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007096 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007097}
7098
7099PyObject *
7100PyUnicode_AsASCIIString(PyObject *unicode)
7101{
7102 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103}
7104
Victor Stinner99b95382011-07-04 14:23:54 +02007105#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007106
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007107/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007108
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007109#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110#define NEED_RETRY
7111#endif
7112
Victor Stinner3a50e702011-10-18 21:21:00 +02007113#ifndef WC_ERR_INVALID_CHARS
7114# define WC_ERR_INVALID_CHARS 0x0080
7115#endif
7116
7117static char*
7118code_page_name(UINT code_page, PyObject **obj)
7119{
7120 *obj = NULL;
7121 if (code_page == CP_ACP)
7122 return "mbcs";
7123 if (code_page == CP_UTF7)
7124 return "CP_UTF7";
7125 if (code_page == CP_UTF8)
7126 return "CP_UTF8";
7127
7128 *obj = PyBytes_FromFormat("cp%u", code_page);
7129 if (*obj == NULL)
7130 return NULL;
7131 return PyBytes_AS_STRING(*obj);
7132}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133
Alexander Belopolsky40018472011-02-26 01:02:56 +00007134static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007135is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136{
7137 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 if (!IsDBCSLeadByteEx(code_page, *curr))
7141 return 0;
7142
7143 prev = CharPrevExA(code_page, s, curr, 0);
7144 if (prev == curr)
7145 return 1;
7146 /* FIXME: This code is limited to "true" double-byte encodings,
7147 as it assumes an incomplete character consists of a single
7148 byte. */
7149 if (curr - prev == 2)
7150 return 1;
7151 if (!IsDBCSLeadByteEx(code_page, *prev))
7152 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153 return 0;
7154}
7155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156static DWORD
7157decode_code_page_flags(UINT code_page)
7158{
7159 if (code_page == CP_UTF7) {
7160 /* The CP_UTF7 decoder only supports flags=0 */
7161 return 0;
7162 }
7163 else
7164 return MB_ERR_INVALID_CHARS;
7165}
7166
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007167/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 * Decode a byte string from a Windows code page into unicode object in strict
7169 * mode.
7170 *
7171 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7172 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007174static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007175decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007176 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 const char *in,
7178 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007179{
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007181 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183
7184 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 assert(insize > 0);
7186 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7187 if (outsize <= 0)
7188 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189
7190 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007192 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007193 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007194 if (*v == NULL)
7195 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197 }
7198 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007201 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204 }
7205
7206 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007207 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7208 if (outsize <= 0)
7209 goto error;
7210 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007211
Victor Stinner3a50e702011-10-18 21:21:00 +02007212error:
7213 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7214 return -2;
7215 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007216 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217}
7218
Victor Stinner3a50e702011-10-18 21:21:00 +02007219/*
7220 * Decode a byte string from a code page into unicode object with an error
7221 * handler.
7222 *
7223 * Returns consumed size if succeed, or raise a WindowsError or
7224 * UnicodeDecodeError exception and returns -1 on error.
7225 */
7226static int
7227decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007228 PyObject **v,
7229 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 const char *errors)
7231{
7232 const char *startin = in;
7233 const char *endin = in + size;
7234 const DWORD flags = decode_code_page_flags(code_page);
7235 /* Ideally, we should get reason from FormatMessage. This is the Windows
7236 2000 English version of the message. */
7237 const char *reason = "No mapping for the Unicode character exists "
7238 "in the target code page.";
7239 /* each step cannot decode more than 1 character, but a character can be
7240 represented as a surrogate pair */
7241 wchar_t buffer[2], *startout, *out;
7242 int insize, outsize;
7243 PyObject *errorHandler = NULL;
7244 PyObject *exc = NULL;
7245 PyObject *encoding_obj = NULL;
7246 char *encoding;
7247 DWORD err;
7248 int ret = -1;
7249
7250 assert(size > 0);
7251
7252 encoding = code_page_name(code_page, &encoding_obj);
7253 if (encoding == NULL)
7254 return -1;
7255
7256 if (errors == NULL || strcmp(errors, "strict") == 0) {
7257 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7258 UnicodeDecodeError. */
7259 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7260 if (exc != NULL) {
7261 PyCodec_StrictErrors(exc);
7262 Py_CLEAR(exc);
7263 }
7264 goto error;
7265 }
7266
7267 if (*v == NULL) {
7268 /* Create unicode object */
7269 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7270 PyErr_NoMemory();
7271 goto error;
7272 }
Victor Stinnerab595942011-12-17 04:59:06 +01007273 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007274 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007275 if (*v == NULL)
7276 goto error;
7277 startout = PyUnicode_AS_UNICODE(*v);
7278 }
7279 else {
7280 /* Extend unicode object */
7281 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7282 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7283 PyErr_NoMemory();
7284 goto error;
7285 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007286 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007287 goto error;
7288 startout = PyUnicode_AS_UNICODE(*v) + n;
7289 }
7290
7291 /* Decode the byte string character per character */
7292 out = startout;
7293 while (in < endin)
7294 {
7295 /* Decode a character */
7296 insize = 1;
7297 do
7298 {
7299 outsize = MultiByteToWideChar(code_page, flags,
7300 in, insize,
7301 buffer, Py_ARRAY_LENGTH(buffer));
7302 if (outsize > 0)
7303 break;
7304 err = GetLastError();
7305 if (err != ERROR_NO_UNICODE_TRANSLATION
7306 && err != ERROR_INSUFFICIENT_BUFFER)
7307 {
7308 PyErr_SetFromWindowsErr(0);
7309 goto error;
7310 }
7311 insize++;
7312 }
7313 /* 4=maximum length of a UTF-8 sequence */
7314 while (insize <= 4 && (in + insize) <= endin);
7315
7316 if (outsize <= 0) {
7317 Py_ssize_t startinpos, endinpos, outpos;
7318
7319 startinpos = in - startin;
7320 endinpos = startinpos + 1;
7321 outpos = out - PyUnicode_AS_UNICODE(*v);
7322 if (unicode_decode_call_errorhandler(
7323 errors, &errorHandler,
7324 encoding, reason,
7325 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007326 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007327 {
7328 goto error;
7329 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007330 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 }
7332 else {
7333 in += insize;
7334 memcpy(out, buffer, outsize * sizeof(wchar_t));
7335 out += outsize;
7336 }
7337 }
7338
7339 /* write a NUL character at the end */
7340 *out = 0;
7341
7342 /* Extend unicode object */
7343 outsize = out - startout;
7344 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007345 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007348
7349error:
7350 Py_XDECREF(encoding_obj);
7351 Py_XDECREF(errorHandler);
7352 Py_XDECREF(exc);
7353 return ret;
7354}
7355
Victor Stinner3a50e702011-10-18 21:21:00 +02007356static PyObject *
7357decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007358 const char *s, Py_ssize_t size,
7359 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360{
Victor Stinner76a31a62011-11-04 00:05:13 +01007361 PyObject *v = NULL;
7362 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363
Victor Stinner3a50e702011-10-18 21:21:00 +02007364 if (code_page < 0) {
7365 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7366 return NULL;
7367 }
7368
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 do
7373 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007374#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007375 if (size > INT_MAX) {
7376 chunk_size = INT_MAX;
7377 final = 0;
7378 done = 0;
7379 }
7380 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 {
7383 chunk_size = (int)size;
7384 final = (consumed == NULL);
7385 done = 1;
7386 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387
Victor Stinner76a31a62011-11-04 00:05:13 +01007388 /* Skip trailing lead-byte unless 'final' is set */
7389 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7390 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391
Victor Stinner76a31a62011-11-04 00:05:13 +01007392 if (chunk_size == 0 && done) {
7393 if (v != NULL)
7394 break;
7395 Py_INCREF(unicode_empty);
7396 return unicode_empty;
7397 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007398
Victor Stinner76a31a62011-11-04 00:05:13 +01007399
7400 converted = decode_code_page_strict(code_page, &v,
7401 s, chunk_size);
7402 if (converted == -2)
7403 converted = decode_code_page_errors(code_page, &v,
7404 s, chunk_size,
7405 errors);
7406 assert(converted != 0);
7407
7408 if (converted < 0) {
7409 Py_XDECREF(v);
7410 return NULL;
7411 }
7412
7413 if (consumed)
7414 *consumed += converted;
7415
7416 s += converted;
7417 size -= converted;
7418 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007419
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007420 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421}
7422
Alexander Belopolsky40018472011-02-26 01:02:56 +00007423PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007424PyUnicode_DecodeCodePageStateful(int code_page,
7425 const char *s,
7426 Py_ssize_t size,
7427 const char *errors,
7428 Py_ssize_t *consumed)
7429{
7430 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7431}
7432
7433PyObject *
7434PyUnicode_DecodeMBCSStateful(const char *s,
7435 Py_ssize_t size,
7436 const char *errors,
7437 Py_ssize_t *consumed)
7438{
7439 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7440}
7441
7442PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007443PyUnicode_DecodeMBCS(const char *s,
7444 Py_ssize_t size,
7445 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007446{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007447 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7448}
7449
Victor Stinner3a50e702011-10-18 21:21:00 +02007450static DWORD
7451encode_code_page_flags(UINT code_page, const char *errors)
7452{
7453 if (code_page == CP_UTF8) {
7454 if (winver.dwMajorVersion >= 6)
7455 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7456 and later */
7457 return WC_ERR_INVALID_CHARS;
7458 else
7459 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7460 return 0;
7461 }
7462 else if (code_page == CP_UTF7) {
7463 /* CP_UTF7 only supports flags=0 */
7464 return 0;
7465 }
7466 else {
7467 if (errors != NULL && strcmp(errors, "replace") == 0)
7468 return 0;
7469 else
7470 return WC_NO_BEST_FIT_CHARS;
7471 }
7472}
7473
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 * Encode a Unicode string to a Windows code page into a byte string in strict
7476 * mode.
7477 *
7478 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7479 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007481static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007482encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007483 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485{
Victor Stinner554f3f02010-06-16 23:33:54 +00007486 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 BOOL *pusedDefaultChar = &usedDefaultChar;
7488 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007489 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007490 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 const DWORD flags = encode_code_page_flags(code_page, NULL);
7493 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 /* Create a substring so that we can get the UTF-16 representation
7495 of just the slice under consideration. */
7496 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497
Martin v. Löwis3d325192011-11-04 18:23:06 +01007498 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007499
Victor Stinner3a50e702011-10-18 21:21:00 +02007500 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007501 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007503 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007504
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 substring = PyUnicode_Substring(unicode, offset, offset+len);
7506 if (substring == NULL)
7507 return -1;
7508 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7509 if (p == NULL) {
7510 Py_DECREF(substring);
7511 return -1;
7512 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007513
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007514 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 outsize = WideCharToMultiByte(code_page, flags,
7516 p, size,
7517 NULL, 0,
7518 NULL, pusedDefaultChar);
7519 if (outsize <= 0)
7520 goto error;
7521 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 if (pusedDefaultChar && *pusedDefaultChar) {
7523 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007526
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007530 if (*outbytes == NULL) {
7531 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007532 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007533 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007534 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535 }
7536 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 const Py_ssize_t n = PyBytes_Size(*outbytes);
7539 if (outsize > PY_SSIZE_T_MAX - n) {
7540 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007542 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007544 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7545 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007546 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007547 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007549 }
7550
7551 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 outsize = WideCharToMultiByte(code_page, flags,
7553 p, size,
7554 out, outsize,
7555 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007556 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 if (outsize <= 0)
7558 goto error;
7559 if (pusedDefaultChar && *pusedDefaultChar)
7560 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007561 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007562
Victor Stinner3a50e702011-10-18 21:21:00 +02007563error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007564 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7566 return -2;
7567 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007568 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007569}
7570
Victor Stinner3a50e702011-10-18 21:21:00 +02007571/*
7572 * Encode a Unicode string to a Windows code page into a byte string using a
7573 * error handler.
7574 *
7575 * Returns consumed characters if succeed, or raise a WindowsError and returns
7576 * -1 on other error.
7577 */
7578static int
7579encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007580 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007581 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007582{
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007584 Py_ssize_t pos = unicode_offset;
7585 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 /* Ideally, we should get reason from FormatMessage. This is the Windows
7587 2000 English version of the message. */
7588 const char *reason = "invalid character";
7589 /* 4=maximum length of a UTF-8 sequence */
7590 char buffer[4];
7591 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7592 Py_ssize_t outsize;
7593 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 PyObject *errorHandler = NULL;
7595 PyObject *exc = NULL;
7596 PyObject *encoding_obj = NULL;
7597 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007598 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 PyObject *rep;
7600 int ret = -1;
7601
7602 assert(insize > 0);
7603
7604 encoding = code_page_name(code_page, &encoding_obj);
7605 if (encoding == NULL)
7606 return -1;
7607
7608 if (errors == NULL || strcmp(errors, "strict") == 0) {
7609 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7610 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007611 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 if (exc != NULL) {
7613 PyCodec_StrictErrors(exc);
7614 Py_DECREF(exc);
7615 }
7616 Py_XDECREF(encoding_obj);
7617 return -1;
7618 }
7619
7620 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7621 pusedDefaultChar = &usedDefaultChar;
7622 else
7623 pusedDefaultChar = NULL;
7624
7625 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7626 PyErr_NoMemory();
7627 goto error;
7628 }
7629 outsize = insize * Py_ARRAY_LENGTH(buffer);
7630
7631 if (*outbytes == NULL) {
7632 /* Create string object */
7633 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7634 if (*outbytes == NULL)
7635 goto error;
7636 out = PyBytes_AS_STRING(*outbytes);
7637 }
7638 else {
7639 /* Extend string object */
7640 Py_ssize_t n = PyBytes_Size(*outbytes);
7641 if (n > PY_SSIZE_T_MAX - outsize) {
7642 PyErr_NoMemory();
7643 goto error;
7644 }
7645 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7646 goto error;
7647 out = PyBytes_AS_STRING(*outbytes) + n;
7648 }
7649
7650 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007653 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7654 wchar_t chars[2];
7655 int charsize;
7656 if (ch < 0x10000) {
7657 chars[0] = (wchar_t)ch;
7658 charsize = 1;
7659 }
7660 else {
7661 ch -= 0x10000;
7662 chars[0] = 0xd800 + (ch >> 10);
7663 chars[1] = 0xdc00 + (ch & 0x3ff);
7664 charsize = 2;
7665 }
7666
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007668 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 buffer, Py_ARRAY_LENGTH(buffer),
7670 NULL, pusedDefaultChar);
7671 if (outsize > 0) {
7672 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7673 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007675 memcpy(out, buffer, outsize);
7676 out += outsize;
7677 continue;
7678 }
7679 }
7680 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7681 PyErr_SetFromWindowsErr(0);
7682 goto error;
7683 }
7684
Victor Stinner3a50e702011-10-18 21:21:00 +02007685 rep = unicode_encode_call_errorhandler(
7686 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007687 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007689 if (rep == NULL)
7690 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007691 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007692
7693 if (PyBytes_Check(rep)) {
7694 outsize = PyBytes_GET_SIZE(rep);
7695 if (outsize != 1) {
7696 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7697 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7698 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7699 Py_DECREF(rep);
7700 goto error;
7701 }
7702 out = PyBytes_AS_STRING(*outbytes) + offset;
7703 }
7704 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7705 out += outsize;
7706 }
7707 else {
7708 Py_ssize_t i;
7709 enum PyUnicode_Kind kind;
7710 void *data;
7711
Benjamin Petersonbac79492012-01-14 13:34:47 -05007712 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 Py_DECREF(rep);
7714 goto error;
7715 }
7716
7717 outsize = PyUnicode_GET_LENGTH(rep);
7718 if (outsize != 1) {
7719 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7720 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7721 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7722 Py_DECREF(rep);
7723 goto error;
7724 }
7725 out = PyBytes_AS_STRING(*outbytes) + offset;
7726 }
7727 kind = PyUnicode_KIND(rep);
7728 data = PyUnicode_DATA(rep);
7729 for (i=0; i < outsize; i++) {
7730 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7731 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007732 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007733 encoding, unicode,
7734 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 "unable to encode error handler result to ASCII");
7736 Py_DECREF(rep);
7737 goto error;
7738 }
7739 *out = (unsigned char)ch;
7740 out++;
7741 }
7742 }
7743 Py_DECREF(rep);
7744 }
7745 /* write a NUL byte */
7746 *out = 0;
7747 outsize = out - PyBytes_AS_STRING(*outbytes);
7748 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7749 if (_PyBytes_Resize(outbytes, outsize) < 0)
7750 goto error;
7751 ret = 0;
7752
7753error:
7754 Py_XDECREF(encoding_obj);
7755 Py_XDECREF(errorHandler);
7756 Py_XDECREF(exc);
7757 return ret;
7758}
7759
Victor Stinner3a50e702011-10-18 21:21:00 +02007760static PyObject *
7761encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007762 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 const char *errors)
7764{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007765 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007766 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007767 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007769
Benjamin Petersonbac79492012-01-14 13:34:47 -05007770 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007771 return NULL;
7772 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007773
Victor Stinner3a50e702011-10-18 21:21:00 +02007774 if (code_page < 0) {
7775 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7776 return NULL;
7777 }
7778
Martin v. Löwis3d325192011-11-04 18:23:06 +01007779 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007780 return PyBytes_FromStringAndSize(NULL, 0);
7781
Victor Stinner7581cef2011-11-03 22:32:33 +01007782 offset = 0;
7783 do
7784 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007785#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007786 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007787 chunks. */
7788 if (len > INT_MAX/2) {
7789 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007790 done = 0;
7791 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007792 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007793#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007794 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007795 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007796 done = 1;
7797 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007798
Victor Stinner76a31a62011-11-04 00:05:13 +01007799 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007800 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007801 errors);
7802 if (ret == -2)
7803 ret = encode_code_page_errors(code_page, &outbytes,
7804 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007805 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007806 if (ret < 0) {
7807 Py_XDECREF(outbytes);
7808 return NULL;
7809 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007810
Victor Stinner7581cef2011-11-03 22:32:33 +01007811 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007812 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007813 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007814
Victor Stinner3a50e702011-10-18 21:21:00 +02007815 return outbytes;
7816}
7817
7818PyObject *
7819PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7820 Py_ssize_t size,
7821 const char *errors)
7822{
Victor Stinner7581cef2011-11-03 22:32:33 +01007823 PyObject *unicode, *res;
7824 unicode = PyUnicode_FromUnicode(p, size);
7825 if (unicode == NULL)
7826 return NULL;
7827 res = encode_code_page(CP_ACP, unicode, errors);
7828 Py_DECREF(unicode);
7829 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007830}
7831
7832PyObject *
7833PyUnicode_EncodeCodePage(int code_page,
7834 PyObject *unicode,
7835 const char *errors)
7836{
Victor Stinner7581cef2011-11-03 22:32:33 +01007837 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007838}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007839
Alexander Belopolsky40018472011-02-26 01:02:56 +00007840PyObject *
7841PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007842{
7843 if (!PyUnicode_Check(unicode)) {
7844 PyErr_BadArgument();
7845 return NULL;
7846 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007847 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007848}
7849
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007850#undef NEED_RETRY
7851
Victor Stinner99b95382011-07-04 14:23:54 +02007852#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007853
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854/* --- Character Mapping Codec -------------------------------------------- */
7855
Alexander Belopolsky40018472011-02-26 01:02:56 +00007856PyObject *
7857PyUnicode_DecodeCharmap(const char *s,
7858 Py_ssize_t size,
7859 PyObject *mapping,
7860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007862 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007863 Py_ssize_t startinpos;
7864 Py_ssize_t endinpos;
7865 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007867 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007868 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007869 PyObject *errorHandler = NULL;
7870 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007871
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872 /* Default to Latin-1 */
7873 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007875
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007876 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007878 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007880 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007881 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007882 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007883 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007884 Py_ssize_t maplen;
7885 enum PyUnicode_Kind kind;
7886 void *data;
7887 Py_UCS4 x;
7888
Benjamin Petersonbac79492012-01-14 13:34:47 -05007889 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007890 return NULL;
7891
7892 maplen = PyUnicode_GET_LENGTH(mapping);
7893 data = PyUnicode_DATA(mapping);
7894 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 while (s < e) {
7896 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007897
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007899 x = PyUnicode_READ(kind, data, ch);
7900 else
7901 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007903 if (x == 0xfffe)
7904 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 startinpos = s-starts;
7907 endinpos = startinpos+1;
7908 if (unicode_decode_call_errorhandler(
7909 errors, &errorHandler,
7910 "charmap", "character maps to <undefined>",
7911 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007912 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 goto onError;
7914 }
7915 continue;
7916 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007917
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007918 if (unicode_putchar(&v, &outpos, x) < 0)
7919 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007921 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007922 }
7923 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007924 while (s < e) {
7925 unsigned char ch = *s;
7926 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007927
Benjamin Peterson29060642009-01-31 22:14:21 +00007928 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7929 w = PyLong_FromLong((long)ch);
7930 if (w == NULL)
7931 goto onError;
7932 x = PyObject_GetItem(mapping, w);
7933 Py_DECREF(w);
7934 if (x == NULL) {
7935 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7936 /* No mapping found means: mapping is undefined. */
7937 PyErr_Clear();
7938 x = Py_None;
7939 Py_INCREF(x);
7940 } else
7941 goto onError;
7942 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007943
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 /* Apply mapping */
7945 if (PyLong_Check(x)) {
7946 long value = PyLong_AS_LONG(x);
7947 if (value < 0 || value > 65535) {
7948 PyErr_SetString(PyExc_TypeError,
7949 "character mapping must be in range(65536)");
7950 Py_DECREF(x);
7951 goto onError;
7952 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007953 if (unicode_putchar(&v, &outpos, value) < 0)
7954 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 }
7956 else if (x == Py_None) {
7957 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007958 startinpos = s-starts;
7959 endinpos = startinpos+1;
7960 if (unicode_decode_call_errorhandler(
7961 errors, &errorHandler,
7962 "charmap", "character maps to <undefined>",
7963 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007964 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 Py_DECREF(x);
7966 goto onError;
7967 }
7968 Py_DECREF(x);
7969 continue;
7970 }
7971 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007972 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973
Benjamin Petersonbac79492012-01-14 13:34:47 -05007974 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007975 goto onError;
7976 targetsize = PyUnicode_GET_LENGTH(x);
7977
7978 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007980 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007981 PyUnicode_READ_CHAR(x, 0)) < 0)
7982 goto onError;
7983 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 else if (targetsize > 1) {
7985 /* 1-n mapping */
7986 if (targetsize > extrachars) {
7987 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 Py_ssize_t needed = (targetsize - extrachars) + \
7989 (targetsize << 2);
7990 extrachars += needed;
7991 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007992 if (unicode_resize(&v,
7993 PyUnicode_GET_LENGTH(v) + needed) < 0)
7994 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 Py_DECREF(x);
7996 goto onError;
7997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007999 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01008000 goto onError;
8001 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
8002 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 extrachars -= targetsize;
8004 }
8005 /* 1-0 mapping: skip the character */
8006 }
8007 else {
8008 /* wrong return value */
8009 PyErr_SetString(PyExc_TypeError,
8010 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008011 Py_DECREF(x);
8012 goto onError;
8013 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 Py_DECREF(x);
8015 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008017 }
Victor Stinner16e6a802011-12-12 13:24:15 +01008018 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01008019 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 Py_XDECREF(errorHandler);
8021 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008022 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00008023
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 Py_XDECREF(errorHandler);
8026 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 Py_XDECREF(v);
8028 return NULL;
8029}
8030
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008031/* Charmap encoding: the lookup table */
8032
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 PyObject_HEAD
8035 unsigned char level1[32];
8036 int count2, count3;
8037 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038};
8039
8040static PyObject*
8041encoding_map_size(PyObject *obj, PyObject* args)
8042{
8043 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046}
8047
8048static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008050 PyDoc_STR("Return the size (in bytes) of this object") },
8051 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008052};
8053
8054static void
8055encoding_map_dealloc(PyObject* o)
8056{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008057 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008058}
8059
8060static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 "EncodingMap", /*tp_name*/
8063 sizeof(struct encoding_map), /*tp_basicsize*/
8064 0, /*tp_itemsize*/
8065 /* methods */
8066 encoding_map_dealloc, /*tp_dealloc*/
8067 0, /*tp_print*/
8068 0, /*tp_getattr*/
8069 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008070 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 0, /*tp_repr*/
8072 0, /*tp_as_number*/
8073 0, /*tp_as_sequence*/
8074 0, /*tp_as_mapping*/
8075 0, /*tp_hash*/
8076 0, /*tp_call*/
8077 0, /*tp_str*/
8078 0, /*tp_getattro*/
8079 0, /*tp_setattro*/
8080 0, /*tp_as_buffer*/
8081 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8082 0, /*tp_doc*/
8083 0, /*tp_traverse*/
8084 0, /*tp_clear*/
8085 0, /*tp_richcompare*/
8086 0, /*tp_weaklistoffset*/
8087 0, /*tp_iter*/
8088 0, /*tp_iternext*/
8089 encoding_map_methods, /*tp_methods*/
8090 0, /*tp_members*/
8091 0, /*tp_getset*/
8092 0, /*tp_base*/
8093 0, /*tp_dict*/
8094 0, /*tp_descr_get*/
8095 0, /*tp_descr_set*/
8096 0, /*tp_dictoffset*/
8097 0, /*tp_init*/
8098 0, /*tp_alloc*/
8099 0, /*tp_new*/
8100 0, /*tp_free*/
8101 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102};
8103
8104PyObject*
8105PyUnicode_BuildEncodingMap(PyObject* string)
8106{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 PyObject *result;
8108 struct encoding_map *mresult;
8109 int i;
8110 int need_dict = 0;
8111 unsigned char level1[32];
8112 unsigned char level2[512];
8113 unsigned char *mlevel1, *mlevel2, *mlevel3;
8114 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 int kind;
8116 void *data;
8117 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 PyErr_BadArgument();
8121 return NULL;
8122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008123 kind = PyUnicode_KIND(string);
8124 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 memset(level1, 0xFF, sizeof level1);
8126 memset(level2, 0xFF, sizeof level2);
8127
8128 /* If there isn't a one-to-one mapping of NULL to \0,
8129 or if there are non-BMP characters, we need to use
8130 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008132 need_dict = 1;
8133 for (i = 1; i < 256; i++) {
8134 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 ch = PyUnicode_READ(kind, data, i);
8136 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008137 need_dict = 1;
8138 break;
8139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008141 /* unmapped character */
8142 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008143 l1 = ch >> 11;
8144 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008145 if (level1[l1] == 0xFF)
8146 level1[l1] = count2++;
8147 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008148 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 }
8150
8151 if (count2 >= 0xFF || count3 >= 0xFF)
8152 need_dict = 1;
8153
8154 if (need_dict) {
8155 PyObject *result = PyDict_New();
8156 PyObject *key, *value;
8157 if (!result)
8158 return NULL;
8159 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008161 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008162 if (!key || !value)
8163 goto failed1;
8164 if (PyDict_SetItem(result, key, value) == -1)
8165 goto failed1;
8166 Py_DECREF(key);
8167 Py_DECREF(value);
8168 }
8169 return result;
8170 failed1:
8171 Py_XDECREF(key);
8172 Py_XDECREF(value);
8173 Py_DECREF(result);
8174 return NULL;
8175 }
8176
8177 /* Create a three-level trie */
8178 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8179 16*count2 + 128*count3 - 1);
8180 if (!result)
8181 return PyErr_NoMemory();
8182 PyObject_Init(result, &EncodingMapType);
8183 mresult = (struct encoding_map*)result;
8184 mresult->count2 = count2;
8185 mresult->count3 = count3;
8186 mlevel1 = mresult->level1;
8187 mlevel2 = mresult->level23;
8188 mlevel3 = mresult->level23 + 16*count2;
8189 memcpy(mlevel1, level1, 32);
8190 memset(mlevel2, 0xFF, 16*count2);
8191 memset(mlevel3, 0, 128*count3);
8192 count3 = 0;
8193 for (i = 1; i < 256; i++) {
8194 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196 /* unmapped character */
8197 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 o1 = PyUnicode_READ(kind, data, i)>>11;
8199 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 i2 = 16*mlevel1[o1] + o2;
8201 if (mlevel2[i2] == 0xFF)
8202 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008203 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 i3 = 128*mlevel2[i2] + o3;
8205 mlevel3[i3] = i;
8206 }
8207 return result;
8208}
8209
8210static int
Victor Stinner22168992011-11-20 17:09:18 +01008211encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008212{
8213 struct encoding_map *map = (struct encoding_map*)mapping;
8214 int l1 = c>>11;
8215 int l2 = (c>>7) & 0xF;
8216 int l3 = c & 0x7F;
8217 int i;
8218
Victor Stinner22168992011-11-20 17:09:18 +01008219 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 if (c == 0)
8222 return 0;
8223 /* level 1*/
8224 i = map->level1[l1];
8225 if (i == 0xFF) {
8226 return -1;
8227 }
8228 /* level 2*/
8229 i = map->level23[16*i+l2];
8230 if (i == 0xFF) {
8231 return -1;
8232 }
8233 /* level 3 */
8234 i = map->level23[16*map->count2 + 128*i + l3];
8235 if (i == 0) {
8236 return -1;
8237 }
8238 return i;
8239}
8240
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241/* Lookup the character ch in the mapping. If the character
8242 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008243 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008244static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008245charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008246{
Christian Heimes217cfd12007-12-02 14:31:20 +00008247 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008248 PyObject *x;
8249
8250 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008252 x = PyObject_GetItem(mapping, w);
8253 Py_DECREF(w);
8254 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8256 /* No mapping found means: mapping is undefined. */
8257 PyErr_Clear();
8258 x = Py_None;
8259 Py_INCREF(x);
8260 return x;
8261 } else
8262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008264 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008265 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008266 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 long value = PyLong_AS_LONG(x);
8268 if (value < 0 || value > 255) {
8269 PyErr_SetString(PyExc_TypeError,
8270 "character mapping must be in range(256)");
8271 Py_DECREF(x);
8272 return NULL;
8273 }
8274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008276 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008279 /* wrong return value */
8280 PyErr_Format(PyExc_TypeError,
8281 "character mapping must return integer, bytes or None, not %.400s",
8282 x->ob_type->tp_name);
8283 Py_DECREF(x);
8284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 }
8286}
8287
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008288static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008289charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008290{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008291 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8292 /* exponentially overallocate to minimize reallocations */
8293 if (requiredsize < 2*outsize)
8294 requiredsize = 2*outsize;
8295 if (_PyBytes_Resize(outobj, requiredsize))
8296 return -1;
8297 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008298}
8299
/* Outcome of trying to encode one character through a charmap */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping is undefined for this character */
    enc_EXCEPTION   /* an error occurred (lookup or buffer resize) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008304 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 space is available. Return a new reference to the object that
8306 was put in the output buffer, or Py_None, if the mapping was undefined
8307 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008308 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008310charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008311 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008313 PyObject *rep;
8314 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316
Christian Heimes90aa7642007-12-19 02:45:37 +00008317 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008320 if (res == -1)
8321 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 if (outsize<requiredsize)
8323 if (charmapencode_resize(outobj, outpos, requiredsize))
8324 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008325 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 outstart[(*outpos)++] = (char)res;
8327 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008328 }
8329
8330 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 Py_DECREF(rep);
8335 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if (PyLong_Check(rep)) {
8338 Py_ssize_t requiredsize = *outpos+1;
8339 if (outsize<requiredsize)
8340 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8341 Py_DECREF(rep);
8342 return enc_EXCEPTION;
8343 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008347 else {
8348 const char *repchars = PyBytes_AS_STRING(rep);
8349 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8350 Py_ssize_t requiredsize = *outpos+repsize;
8351 if (outsize<requiredsize)
8352 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8353 Py_DECREF(rep);
8354 return enc_EXCEPTION;
8355 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008356 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 memcpy(outstart + *outpos, repchars, repsize);
8358 *outpos += repsize;
8359 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008361 Py_DECREF(rep);
8362 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363}
8364
8365/* handle an error in PyUnicode_EncodeCharmap
8366 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367static int
8368charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008369 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008371 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008372 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373{
8374 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008375 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008376 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008377 enum PyUnicode_Kind kind;
8378 void *data;
8379 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008381 Py_ssize_t collstartpos = *inpos;
8382 Py_ssize_t collendpos = *inpos+1;
8383 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008384 char *encoding = "charmap";
8385 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008387 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008388 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389
Benjamin Petersonbac79492012-01-14 13:34:47 -05008390 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008391 return -1;
8392 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393 /* find all unencodable characters */
8394 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008395 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008396 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008397 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008398 val = encoding_map_lookup(ch, mapping);
8399 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 break;
8401 ++collendpos;
8402 continue;
8403 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008404
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008405 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8406 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 if (rep==NULL)
8408 return -1;
8409 else if (rep!=Py_None) {
8410 Py_DECREF(rep);
8411 break;
8412 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008413 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
8416 /* cache callback name lookup
8417 * (if not done yet, i.e. it's the first error) */
8418 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 if ((errors==NULL) || (!strcmp(errors, "strict")))
8420 *known_errorHandler = 1;
8421 else if (!strcmp(errors, "replace"))
8422 *known_errorHandler = 2;
8423 else if (!strcmp(errors, "ignore"))
8424 *known_errorHandler = 3;
8425 else if (!strcmp(errors, "xmlcharrefreplace"))
8426 *known_errorHandler = 4;
8427 else
8428 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008431 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008432 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 return -1;
8434 case 2: /* replace */
8435 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 x = charmapencode_output('?', mapping, res, respos);
8437 if (x==enc_EXCEPTION) {
8438 return -1;
8439 }
8440 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008441 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 return -1;
8443 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 }
8445 /* fall through */
8446 case 3: /* ignore */
8447 *inpos = collendpos;
8448 break;
8449 case 4: /* xmlcharrefreplace */
8450 /* generate replacement (temporarily (mis)uses p) */
8451 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 char buffer[2+29+1+1];
8453 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008454 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 for (cp = buffer; *cp; ++cp) {
8456 x = charmapencode_output(*cp, mapping, res, respos);
8457 if (x==enc_EXCEPTION)
8458 return -1;
8459 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008460 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 return -1;
8462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 }
8464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 *inpos = collendpos;
8466 break;
8467 default:
8468 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008469 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008473 if (PyBytes_Check(repunicode)) {
8474 /* Directly copy bytes result to output. */
8475 Py_ssize_t outsize = PyBytes_Size(*res);
8476 Py_ssize_t requiredsize;
8477 repsize = PyBytes_Size(repunicode);
8478 requiredsize = *respos + repsize;
8479 if (requiredsize > outsize)
8480 /* Make room for all additional bytes. */
8481 if (charmapencode_resize(res, respos, requiredsize)) {
8482 Py_DECREF(repunicode);
8483 return -1;
8484 }
8485 memcpy(PyBytes_AsString(*res) + *respos,
8486 PyBytes_AsString(repunicode), repsize);
8487 *respos += repsize;
8488 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008489 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008490 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008491 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008492 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008493 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008494 Py_DECREF(repunicode);
8495 return -1;
8496 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008497 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008498 data = PyUnicode_DATA(repunicode);
8499 kind = PyUnicode_KIND(repunicode);
8500 for (index = 0; index < repsize; index++) {
8501 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8502 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008504 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 return -1;
8506 }
8507 else if (x==enc_FAILED) {
8508 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008509 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
8511 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008512 }
8513 *inpos = newpos;
8514 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 }
8516 return 0;
8517}
8518
Alexander Belopolsky40018472011-02-26 01:02:56 +00008519PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008520_PyUnicode_EncodeCharmap(PyObject *unicode,
8521 PyObject *mapping,
8522 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 /* output object */
8525 PyObject *res = NULL;
8526 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008529 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008530 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 PyObject *errorHandler = NULL;
8532 PyObject *exc = NULL;
8533 /* the following variable is used for caching string comparisons
8534 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8535 * 3=ignore, 4=xmlcharrefreplace */
8536 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008537
Benjamin Petersonbac79492012-01-14 13:34:47 -05008538 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008539 return NULL;
8540 size = PyUnicode_GET_LENGTH(unicode);
8541
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542 /* Default to Latin-1 */
8543 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008544 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008545
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 /* allocate enough for a simple encoding without
8547 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008548 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 if (res == NULL)
8550 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008551 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008554 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008557 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008558 if (x==enc_EXCEPTION) /* error */
8559 goto onError;
8560 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008561 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 &exc,
8563 &known_errorHandler, &errorHandler, errors,
8564 &res, &respos)) {
8565 goto onError;
8566 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008567 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 else
8569 /* done with this character => adjust input position */
8570 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008574 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008575 if (_PyBytes_Resize(&res, respos) < 0)
8576 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 Py_XDECREF(exc);
8579 Py_XDECREF(errorHandler);
8580 return res;
8581
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 Py_XDECREF(res);
8584 Py_XDECREF(exc);
8585 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586 return NULL;
8587}
8588
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589/* Deprecated */
8590PyObject *
8591PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8592 Py_ssize_t size,
8593 PyObject *mapping,
8594 const char *errors)
8595{
8596 PyObject *result;
8597 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8598 if (unicode == NULL)
8599 return NULL;
8600 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8601 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008602 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008603}
8604
Alexander Belopolsky40018472011-02-26 01:02:56 +00008605PyObject *
8606PyUnicode_AsCharmapString(PyObject *unicode,
8607 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608{
8609 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 PyErr_BadArgument();
8611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008612 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008613 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614}
8615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617static void
8618make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008620 Py_ssize_t startpos, Py_ssize_t endpos,
8621 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 *exceptionObject = _PyUnicodeTranslateError_Create(
8625 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008626 }
8627 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8629 goto onError;
8630 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8631 goto onError;
8632 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8633 goto onError;
8634 return;
8635 onError:
8636 Py_DECREF(*exceptionObject);
8637 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638 }
8639}
8640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642static void
8643raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008645 Py_ssize_t startpos, Py_ssize_t endpos,
8646 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647{
8648 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008652}
8653
8654/* error handling callback helper:
8655 build arguments, call the callback and check the arguments,
8656 put the result into newpos and return the replacement string, which
8657 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658static PyObject *
8659unicode_translate_call_errorhandler(const char *errors,
8660 PyObject **errorHandler,
8661 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663 Py_ssize_t startpos, Py_ssize_t endpos,
8664 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008665{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008666 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008667
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008668 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 PyObject *restuple;
8670 PyObject *resunicode;
8671
8672 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008674 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 }
8677
8678 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682
8683 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008688 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008689 Py_DECREF(restuple);
8690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 }
8692 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008693 &resunicode, &i_newpos)) {
8694 Py_DECREF(restuple);
8695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008697 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008699 else
8700 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8703 Py_DECREF(restuple);
8704 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008705 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 Py_INCREF(resunicode);
8707 Py_DECREF(restuple);
8708 return resunicode;
8709}
8710
8711/* Lookup the character ch in the mapping and put the result in result,
8712 which must be decrefed by the caller.
8713 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008714static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716{
Christian Heimes217cfd12007-12-02 14:31:20 +00008717 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 PyObject *x;
8719
8720 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 x = PyObject_GetItem(mapping, w);
8723 Py_DECREF(w);
8724 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8726 /* No mapping found means: use 1:1 mapping. */
8727 PyErr_Clear();
8728 *result = NULL;
8729 return 0;
8730 } else
8731 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
8733 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 *result = x;
8735 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008737 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008738 long value = PyLong_AS_LONG(x);
8739 long max = PyUnicode_GetMax();
8740 if (value < 0 || value > max) {
8741 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008742 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 Py_DECREF(x);
8744 return -1;
8745 }
8746 *result = x;
8747 return 0;
8748 }
8749 else if (PyUnicode_Check(x)) {
8750 *result = x;
8751 return 0;
8752 }
8753 else {
8754 /* wrong return value */
8755 PyErr_SetString(PyExc_TypeError,
8756 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008757 Py_DECREF(x);
8758 return -1;
8759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760}
8761/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 if not reallocate and adjust various state variables.
8763 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008764static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008769 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 /* exponentially overallocate to minimize reallocations */
8771 if (requiredsize < 2 * oldsize)
8772 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8774 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008775 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 }
8778 return 0;
8779}
8780/* lookup the character, put the result in the output string and adjust
8781 various state variables. Return a new reference to the object that
8782 was put in the output buffer in *result, or Py_None, if the mapping was
8783 undefined (in which case no character was written).
8784 The called must decref result.
8785 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008786static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8788 PyObject *mapping, Py_UCS4 **output,
8789 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008792 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8793 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008795 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 }
8799 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008801 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804 }
8805 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 Py_ssize_t repsize;
8807 if (PyUnicode_READY(*res) == -1)
8808 return -1;
8809 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 if (repsize==1) {
8811 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 }
8814 else if (repsize!=0) {
8815 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 Py_ssize_t requiredsize = *opos +
8817 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 Py_ssize_t i;
8820 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 for(i = 0; i < repsize; i++)
8823 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 }
8826 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008828 return 0;
8829}
8830
Alexander Belopolsky40018472011-02-26 01:02:56 +00008831PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832_PyUnicode_TranslateCharmap(PyObject *input,
8833 PyObject *mapping,
8834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008835{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 /* input object */
8837 char *idata;
8838 Py_ssize_t size, i;
8839 int kind;
8840 /* output buffer */
8841 Py_UCS4 *output = NULL;
8842 Py_ssize_t osize;
8843 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008846 char *reason = "character maps to <undefined>";
8847 PyObject *errorHandler = NULL;
8848 PyObject *exc = NULL;
8849 /* the following variable is used for caching string comparisons
8850 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8851 * 3=ignore, 4=xmlcharrefreplace */
8852 int known_errorHandler = -1;
8853
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008855 PyErr_BadArgument();
8856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008859 if (PyUnicode_READY(input) == -1)
8860 return NULL;
8861 idata = (char*)PyUnicode_DATA(input);
8862 kind = PyUnicode_KIND(input);
8863 size = PyUnicode_GET_LENGTH(input);
8864 i = 0;
8865
8866 if (size == 0) {
8867 Py_INCREF(input);
8868 return input;
8869 }
8870
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871 /* allocate enough for a simple 1:1 translation without
8872 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 osize = size;
8874 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8875 opos = 0;
8876 if (output == NULL) {
8877 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 /* try to encode it */
8883 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008884 if (charmaptranslate_output(input, i, mapping,
8885 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 Py_XDECREF(x);
8887 goto onError;
8888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 else { /* untranslatable character */
8893 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8894 Py_ssize_t repsize;
8895 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 Py_ssize_t collstart = i;
8899 Py_ssize_t collend = i+1;
8900 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008901
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 while (collend < size) {
8904 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008905 goto onError;
8906 Py_XDECREF(x);
8907 if (x!=Py_None)
8908 break;
8909 ++collend;
8910 }
8911 /* cache callback name lookup
8912 * (if not done yet, i.e. it's the first error) */
8913 if (known_errorHandler==-1) {
8914 if ((errors==NULL) || (!strcmp(errors, "strict")))
8915 known_errorHandler = 1;
8916 else if (!strcmp(errors, "replace"))
8917 known_errorHandler = 2;
8918 else if (!strcmp(errors, "ignore"))
8919 known_errorHandler = 3;
8920 else if (!strcmp(errors, "xmlcharrefreplace"))
8921 known_errorHandler = 4;
8922 else
8923 known_errorHandler = 0;
8924 }
8925 switch (known_errorHandler) {
8926 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008927 raise_translate_exception(&exc, input, collstart,
8928 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008929 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 case 2: /* replace */
8931 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 for (coll = collstart; coll<collend; coll++)
8933 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 /* fall through */
8935 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008937 break;
8938 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 /* generate replacement (temporarily (mis)uses i) */
8940 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008941 char buffer[2+29+1+1];
8942 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8944 if (charmaptranslate_makespace(&output, &osize,
8945 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 goto onError;
8947 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008951 break;
8952 default:
8953 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 reason, input, &exc,
8955 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008956 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008957 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008958 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008959 Py_DECREF(repunicode);
8960 goto onError;
8961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008962 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 repsize = PyUnicode_GET_LENGTH(repunicode);
8964 if (charmaptranslate_makespace(&output, &osize,
8965 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008966 Py_DECREF(repunicode);
8967 goto onError;
8968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 for (uni2 = 0; repsize-->0; ++uni2)
8970 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8971 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008972 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008973 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008974 }
8975 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8977 if (!res)
8978 goto onError;
8979 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 Py_XDECREF(exc);
8981 Py_XDECREF(errorHandler);
8982 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008986 Py_XDECREF(exc);
8987 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988 return NULL;
8989}
8990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991/* Deprecated. Use PyUnicode_Translate instead. */
8992PyObject *
8993PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8994 Py_ssize_t size,
8995 PyObject *mapping,
8996 const char *errors)
8997{
8998 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8999 if (!unicode)
9000 return NULL;
9001 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9002}
9003
Alexander Belopolsky40018472011-02-26 01:02:56 +00009004PyObject *
9005PyUnicode_Translate(PyObject *str,
9006 PyObject *mapping,
9007 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008{
9009 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00009010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 str = PyUnicode_FromObject(str);
9012 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 Py_DECREF(str);
9016 return result;
Tim Petersced69f82003-09-16 20:30:58 +00009017
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 Py_XDECREF(str);
9020 return NULL;
9021}
Tim Petersced69f82003-09-16 20:30:58 +00009022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009024fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025{
9026 /* No need to call PyUnicode_READY(self) because this function is only
9027 called as a callback from fixup() which does it already. */
9028 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9029 const int kind = PyUnicode_KIND(self);
9030 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009031 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009032 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033 Py_ssize_t i;
9034
9035 for (i = 0; i < len; ++i) {
9036 ch = PyUnicode_READ(kind, data, i);
9037 fixed = 0;
9038 if (ch > 127) {
9039 if (Py_UNICODE_ISSPACE(ch))
9040 fixed = ' ';
9041 else {
9042 const int decimal = Py_UNICODE_TODECIMAL(ch);
9043 if (decimal >= 0)
9044 fixed = '0' + decimal;
9045 }
9046 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009047 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02009048 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009049 PyUnicode_WRITE(kind, data, i, fixed);
9050 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009051 else
9052 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054 }
9055
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009056 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057}
9058
9059PyObject *
9060_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9061{
9062 if (!PyUnicode_Check(unicode)) {
9063 PyErr_BadInternalCall();
9064 return NULL;
9065 }
9066 if (PyUnicode_READY(unicode) == -1)
9067 return NULL;
9068 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9069 /* If the string is already ASCII, just return the same string */
9070 Py_INCREF(unicode);
9071 return unicode;
9072 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009073 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074}
9075
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009076PyObject *
9077PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9078 Py_ssize_t length)
9079{
Victor Stinnerf0124502011-11-21 23:12:56 +01009080 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009081 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009082 Py_UCS4 maxchar;
9083 enum PyUnicode_Kind kind;
9084 void *data;
9085
Victor Stinner99d7ad02012-02-22 13:37:39 +01009086 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009087 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009088 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009089 if (ch > 127) {
9090 int decimal = Py_UNICODE_TODECIMAL(ch);
9091 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009092 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02009093 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009094 }
9095 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009096
9097 /* Copy to a new string */
9098 decimal = PyUnicode_New(length, maxchar);
9099 if (decimal == NULL)
9100 return decimal;
9101 kind = PyUnicode_KIND(decimal);
9102 data = PyUnicode_DATA(decimal);
9103 /* Iterate over code points */
9104 for (i = 0; i < length; i++) {
9105 Py_UNICODE ch = s[i];
9106 if (ch > 127) {
9107 int decimal = Py_UNICODE_TODECIMAL(ch);
9108 if (decimal >= 0)
9109 ch = '0' + decimal;
9110 }
9111 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009113 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009114}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009115/* --- Decimal Encoder ---------------------------------------------------- */
9116
Alexander Belopolsky40018472011-02-26 01:02:56 +00009117int
9118PyUnicode_EncodeDecimal(Py_UNICODE *s,
9119 Py_ssize_t length,
9120 char *output,
9121 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009122{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009123 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009124 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009125 enum PyUnicode_Kind kind;
9126 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009127
9128 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 PyErr_BadArgument();
9130 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009131 }
9132
Victor Stinner42bf7752011-11-21 22:52:58 +01009133 unicode = PyUnicode_FromUnicode(s, length);
9134 if (unicode == NULL)
9135 return -1;
9136
Benjamin Petersonbac79492012-01-14 13:34:47 -05009137 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009138 Py_DECREF(unicode);
9139 return -1;
9140 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009141 kind = PyUnicode_KIND(unicode);
9142 data = PyUnicode_DATA(unicode);
9143
Victor Stinnerb84d7232011-11-22 01:50:07 +01009144 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009145 PyObject *exc;
9146 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009148 Py_ssize_t startpos;
9149
9150 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009151
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009154 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009156 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 decimal = Py_UNICODE_TODECIMAL(ch);
9158 if (decimal >= 0) {
9159 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009160 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 continue;
9162 }
9163 if (0 < ch && ch < 256) {
9164 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009165 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009166 continue;
9167 }
Victor Stinner6345be92011-11-25 20:09:01 +01009168
Victor Stinner42bf7752011-11-21 22:52:58 +01009169 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009170 exc = NULL;
9171 raise_encode_exception(&exc, "decimal", unicode,
9172 startpos, startpos+1,
9173 "invalid decimal Unicode string");
9174 Py_XDECREF(exc);
9175 Py_DECREF(unicode);
9176 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009177 }
9178 /* 0-terminate the output string */
9179 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009180 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009181 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009182}
9183
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184/* --- Helpers ------------------------------------------------------------ */
9185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009187any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 Py_ssize_t start,
9189 Py_ssize_t end)
9190{
9191 int kind1, kind2, kind;
9192 void *buf1, *buf2;
9193 Py_ssize_t len1, len2, result;
9194
9195 kind1 = PyUnicode_KIND(s1);
9196 kind2 = PyUnicode_KIND(s2);
9197 kind = kind1 > kind2 ? kind1 : kind2;
9198 buf1 = PyUnicode_DATA(s1);
9199 buf2 = PyUnicode_DATA(s2);
9200 if (kind1 != kind)
9201 buf1 = _PyUnicode_AsKind(s1, kind);
9202 if (!buf1)
9203 return -2;
9204 if (kind2 != kind)
9205 buf2 = _PyUnicode_AsKind(s2, kind);
9206 if (!buf2) {
9207 if (kind1 != kind) PyMem_Free(buf1);
9208 return -2;
9209 }
9210 len1 = PyUnicode_GET_LENGTH(s1);
9211 len2 = PyUnicode_GET_LENGTH(s2);
9212
Victor Stinner794d5672011-10-10 03:21:36 +02009213 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009214 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009215 case PyUnicode_1BYTE_KIND:
9216 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9217 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9218 else
9219 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 case PyUnicode_2BYTE_KIND:
9222 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9223 break;
9224 case PyUnicode_4BYTE_KIND:
9225 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9226 break;
9227 default:
9228 assert(0); result = -2;
9229 }
9230 }
9231 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009232 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009233 case PyUnicode_1BYTE_KIND:
9234 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9235 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9236 else
9237 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 case PyUnicode_2BYTE_KIND:
9240 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9241 break;
9242 case PyUnicode_4BYTE_KIND:
9243 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9244 break;
9245 default:
9246 assert(0); result = -2;
9247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 }
9249
9250 if (kind1 != kind)
9251 PyMem_Free(buf1);
9252 if (kind2 != kind)
9253 PyMem_Free(buf2);
9254
9255 return result;
9256}
9257
9258Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009259_PyUnicode_InsertThousandsGrouping(
9260 PyObject *unicode, Py_ssize_t index,
9261 Py_ssize_t n_buffer,
9262 void *digits, Py_ssize_t n_digits,
9263 Py_ssize_t min_width,
9264 const char *grouping, PyObject *thousands_sep,
9265 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266{
Victor Stinner41a863c2012-02-24 00:37:51 +01009267 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009268 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009269 Py_ssize_t thousands_sep_len;
9270 Py_ssize_t len;
9271
9272 if (unicode != NULL) {
9273 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009274 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009275 }
9276 else {
9277 kind = PyUnicode_1BYTE_KIND;
9278 data = NULL;
9279 }
9280 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9281 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9282 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9283 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009284 if (thousands_sep_kind < kind) {
9285 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9286 if (!thousands_sep_data)
9287 return -1;
9288 }
9289 else {
9290 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9291 if (!data)
9292 return -1;
9293 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009294 }
9295
Benjamin Petersonead6b532011-12-20 17:23:42 -06009296 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009298 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009299 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009300 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009301 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009302 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009303 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009304 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009305 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009306 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009307 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009308 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009310 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009311 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009312 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009313 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009314 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009316 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009317 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009318 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009319 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009320 break;
9321 default:
9322 assert(0);
9323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009325 if (unicode != NULL && thousands_sep_kind != kind) {
9326 if (thousands_sep_kind < kind)
9327 PyMem_Free(thousands_sep_data);
9328 else
9329 PyMem_Free(data);
9330 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009331 if (unicode == NULL) {
9332 *maxchar = 127;
9333 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009334 *maxchar = MAX_MAXCHAR(*maxchar,
9335 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009336 }
9337 }
9338 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339}
9340
9341
/* Helper macro to fix up start/end slice values against a length.

   - end greater than len is clamped to len;
   - negative end has len added, and is clamped to 0 if still negative;
   - negative start has len added, and is clamped to 0 if still negative.
   start is never clamped against len, so start may exceed end afterwards.

   start and end must be modifiable lvalues; every argument may be evaluated
   more than once.  The body is wrapped in do { } while (0) so the macro
   expands to exactly one statement (terminate it with a semicolon) and can
   be used safely as the body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            end = (len);                        \
        else if ((end) < 0) {                   \
            end += (len);                       \
            if ((end) < 0)                      \
                end = 0;                        \
        }                                       \
        if ((start) < 0) {                      \
            start += (len);                     \
            if ((start) < 0)                    \
                start = 0;                      \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009356
Alexander Belopolsky40018472011-02-26 01:02:56 +00009357Py_ssize_t
9358PyUnicode_Count(PyObject *str,
9359 PyObject *substr,
9360 Py_ssize_t start,
9361 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009363 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009364 PyObject* str_obj;
9365 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009366 int kind1, kind2, kind;
9367 void *buf1 = NULL, *buf2 = NULL;
9368 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009369
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009370 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009371 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009373 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009374 if (!sub_obj) {
9375 Py_DECREF(str_obj);
9376 return -1;
9377 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009378 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009379 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009380 Py_DECREF(str_obj);
9381 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009382 }
Tim Petersced69f82003-09-16 20:30:58 +00009383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009384 kind1 = PyUnicode_KIND(str_obj);
9385 kind2 = PyUnicode_KIND(sub_obj);
9386 kind = kind1 > kind2 ? kind1 : kind2;
9387 buf1 = PyUnicode_DATA(str_obj);
9388 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009389 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 if (!buf1)
9391 goto onError;
9392 buf2 = PyUnicode_DATA(sub_obj);
9393 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009394 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 if (!buf2)
9396 goto onError;
9397 len1 = PyUnicode_GET_LENGTH(str_obj);
9398 len2 = PyUnicode_GET_LENGTH(sub_obj);
9399
9400 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009401 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009403 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9404 result = asciilib_count(
9405 ((Py_UCS1*)buf1) + start, end - start,
9406 buf2, len2, PY_SSIZE_T_MAX
9407 );
9408 else
9409 result = ucs1lib_count(
9410 ((Py_UCS1*)buf1) + start, end - start,
9411 buf2, len2, PY_SSIZE_T_MAX
9412 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 break;
9414 case PyUnicode_2BYTE_KIND:
9415 result = ucs2lib_count(
9416 ((Py_UCS2*)buf1) + start, end - start,
9417 buf2, len2, PY_SSIZE_T_MAX
9418 );
9419 break;
9420 case PyUnicode_4BYTE_KIND:
9421 result = ucs4lib_count(
9422 ((Py_UCS4*)buf1) + start, end - start,
9423 buf2, len2, PY_SSIZE_T_MAX
9424 );
9425 break;
9426 default:
9427 assert(0); result = 0;
9428 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009429
9430 Py_DECREF(sub_obj);
9431 Py_DECREF(str_obj);
9432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 if (kind1 != kind)
9434 PyMem_Free(buf1);
9435 if (kind2 != kind)
9436 PyMem_Free(buf2);
9437
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 onError:
9440 Py_DECREF(sub_obj);
9441 Py_DECREF(str_obj);
9442 if (kind1 != kind && buf1)
9443 PyMem_Free(buf1);
9444 if (kind2 != kind && buf2)
9445 PyMem_Free(buf2);
9446 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447}
9448
Alexander Belopolsky40018472011-02-26 01:02:56 +00009449Py_ssize_t
9450PyUnicode_Find(PyObject *str,
9451 PyObject *sub,
9452 Py_ssize_t start,
9453 Py_ssize_t end,
9454 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009456 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009457
Guido van Rossumd57fd912000-03-10 22:53:23 +00009458 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009459 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009461 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009462 if (!sub) {
9463 Py_DECREF(str);
9464 return -2;
9465 }
9466 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9467 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 Py_DECREF(str);
9469 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470 }
Tim Petersced69f82003-09-16 20:30:58 +00009471
Victor Stinner794d5672011-10-10 03:21:36 +02009472 result = any_find_slice(direction,
9473 str, sub, start, end
9474 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009475
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009477 Py_DECREF(sub);
9478
Guido van Rossumd57fd912000-03-10 22:53:23 +00009479 return result;
9480}
9481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482Py_ssize_t
9483PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9484 Py_ssize_t start, Py_ssize_t end,
9485 int direction)
9486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009488 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 if (PyUnicode_READY(str) == -1)
9490 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009491 if (start < 0 || end < 0) {
9492 PyErr_SetString(PyExc_IndexError, "string index out of range");
9493 return -2;
9494 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 if (end > PyUnicode_GET_LENGTH(str))
9496 end = PyUnicode_GET_LENGTH(str);
9497 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009498 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9499 kind, end-start, ch, direction);
9500 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009502 else
9503 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504}
9505
Alexander Belopolsky40018472011-02-26 01:02:56 +00009506static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009507tailmatch(PyObject *self,
9508 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009509 Py_ssize_t start,
9510 Py_ssize_t end,
9511 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 int kind_self;
9514 int kind_sub;
9515 void *data_self;
9516 void *data_sub;
9517 Py_ssize_t offset;
9518 Py_ssize_t i;
9519 Py_ssize_t end_sub;
9520
9521 if (PyUnicode_READY(self) == -1 ||
9522 PyUnicode_READY(substring) == -1)
9523 return 0;
9524
9525 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526 return 1;
9527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009528 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9529 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009530 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009531 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 kind_self = PyUnicode_KIND(self);
9534 data_self = PyUnicode_DATA(self);
9535 kind_sub = PyUnicode_KIND(substring);
9536 data_sub = PyUnicode_DATA(substring);
9537 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9538
9539 if (direction > 0)
9540 offset = end;
9541 else
9542 offset = start;
9543
9544 if (PyUnicode_READ(kind_self, data_self, offset) ==
9545 PyUnicode_READ(kind_sub, data_sub, 0) &&
9546 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9547 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9548 /* If both are of the same kind, memcmp is sufficient */
9549 if (kind_self == kind_sub) {
9550 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009551 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 data_sub,
9553 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009554 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 }
9556 /* otherwise we have to compare each character by first accesing it */
9557 else {
9558 /* We do not need to compare 0 and len(substring)-1 because
9559 the if statement above ensured already that they are equal
9560 when we end up here. */
9561 // TODO: honor direction and do a forward or backwards search
9562 for (i = 1; i < end_sub; ++i) {
9563 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9564 PyUnicode_READ(kind_sub, data_sub, i))
9565 return 0;
9566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009569 }
9570
9571 return 0;
9572}
9573
Alexander Belopolsky40018472011-02-26 01:02:56 +00009574Py_ssize_t
9575PyUnicode_Tailmatch(PyObject *str,
9576 PyObject *substr,
9577 Py_ssize_t start,
9578 Py_ssize_t end,
9579 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009581 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009582
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 str = PyUnicode_FromObject(str);
9584 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 substr = PyUnicode_FromObject(substr);
9587 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 Py_DECREF(str);
9589 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590 }
Tim Petersced69f82003-09-16 20:30:58 +00009591
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009592 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009593 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009594 Py_DECREF(str);
9595 Py_DECREF(substr);
9596 return result;
9597}
9598
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599/* Apply fixfct filter to the Unicode object self and return a
9600 reference to the modified object */
9601
Alexander Belopolsky40018472011-02-26 01:02:56 +00009602static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009603fixup(PyObject *self,
9604 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 PyObject *u;
9607 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009608 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009610 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009612 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009613 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 /* fix functions return the new maximum character in a string,
9616 if the kind of the resulting unicode object does not change,
9617 everything is fine. Otherwise we need to change the string kind
9618 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009619 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009620
9621 if (maxchar_new == 0) {
9622 /* no changes */;
9623 if (PyUnicode_CheckExact(self)) {
9624 Py_DECREF(u);
9625 Py_INCREF(self);
9626 return self;
9627 }
9628 else
9629 return u;
9630 }
9631
Victor Stinnere6abb482012-05-02 01:15:40 +02009632 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633
Victor Stinnereaab6042011-12-11 22:22:39 +01009634 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009636
9637 /* In case the maximum character changed, we need to
9638 convert the string to the new category. */
9639 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9640 if (v == NULL) {
9641 Py_DECREF(u);
9642 return NULL;
9643 }
9644 if (maxchar_new > maxchar_old) {
9645 /* If the maxchar increased so that the kind changed, not all
9646 characters are representable anymore and we need to fix the
9647 string again. This only happens in very few cases. */
9648 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9649 maxchar_old = fixfct(v);
9650 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 }
9652 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009653 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009655 Py_DECREF(u);
9656 assert(_PyUnicode_CheckConsistency(v, 1));
9657 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658}
9659
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660static PyObject *
9661ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009663 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9664 char *resdata, *data = PyUnicode_DATA(self);
9665 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009666
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009667 res = PyUnicode_New(len, 127);
9668 if (res == NULL)
9669 return NULL;
9670 resdata = PyUnicode_DATA(res);
9671 if (lower)
9672 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009674 _Py_bytes_upper(resdata, data, len);
9675 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676}
9677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009681 Py_ssize_t j;
9682 int final_sigma;
9683 Py_UCS4 c;
9684 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9687
9688 where ! is a negation and \p{xxx} is a character with property xxx.
9689 */
9690 for (j = i - 1; j >= 0; j--) {
9691 c = PyUnicode_READ(kind, data, j);
9692 if (!_PyUnicode_IsCaseIgnorable(c))
9693 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009695 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9696 if (final_sigma) {
9697 for (j = i + 1; j < length; j++) {
9698 c = PyUnicode_READ(kind, data, j);
9699 if (!_PyUnicode_IsCaseIgnorable(c))
9700 break;
9701 }
9702 final_sigma = j == length || !_PyUnicode_IsCased(c);
9703 }
9704 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705}
9706
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707static int
9708lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9709 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 /* Obscure special case. */
9712 if (c == 0x3A3) {
9713 mapped[0] = handle_capital_sigma(kind, data, length, i);
9714 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009715 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717}
9718
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719static Py_ssize_t
9720do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 Py_ssize_t i, k = 0;
9723 int n_res, j;
9724 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009725
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726 c = PyUnicode_READ(kind, data, 0);
9727 n_res = _PyUnicode_ToUpperFull(c, mapped);
9728 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009729 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732 for (i = 1; i < length; i++) {
9733 c = PyUnicode_READ(kind, data, i);
9734 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9735 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009736 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009738 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009739 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009741}
9742
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743static Py_ssize_t
9744do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9745 Py_ssize_t i, k = 0;
9746
9747 for (i = 0; i < length; i++) {
9748 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9749 int n_res, j;
9750 if (Py_UNICODE_ISUPPER(c)) {
9751 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9752 }
9753 else if (Py_UNICODE_ISLOWER(c)) {
9754 n_res = _PyUnicode_ToUpperFull(c, mapped);
9755 }
9756 else {
9757 n_res = 1;
9758 mapped[0] = c;
9759 }
9760 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009761 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 res[k++] = mapped[j];
9763 }
9764 }
9765 return k;
9766}
9767
9768static Py_ssize_t
9769do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9770 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009772 Py_ssize_t i, k = 0;
9773
9774 for (i = 0; i < length; i++) {
9775 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9776 int n_res, j;
9777 if (lower)
9778 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779 else
9780 n_res = _PyUnicode_ToUpperFull(c, mapped);
9781 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009782 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009783 res[k++] = mapped[j];
9784 }
9785 }
9786 return k;
9787}
9788
9789static Py_ssize_t
9790do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9791{
9792 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9793}
9794
9795static Py_ssize_t
9796do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9797{
9798 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9799}
9800
Benjamin Petersone51757f2012-01-12 21:10:29 -05009801static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009802do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9803{
9804 Py_ssize_t i, k = 0;
9805
9806 for (i = 0; i < length; i++) {
9807 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9808 Py_UCS4 mapped[3];
9809 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9810 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009811 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009812 res[k++] = mapped[j];
9813 }
9814 }
9815 return k;
9816}
9817
9818static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009819do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9820{
9821 Py_ssize_t i, k = 0;
9822 int previous_is_cased;
9823
9824 previous_is_cased = 0;
9825 for (i = 0; i < length; i++) {
9826 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827 Py_UCS4 mapped[3];
9828 int n_res, j;
9829
9830 if (previous_is_cased)
9831 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9832 else
9833 n_res = _PyUnicode_ToTitleFull(c, mapped);
9834
9835 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009836 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009837 res[k++] = mapped[j];
9838 }
9839
9840 previous_is_cased = _PyUnicode_IsCased(c);
9841 }
9842 return k;
9843}
9844
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845static PyObject *
9846case_operation(PyObject *self,
9847 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9848{
9849 PyObject *res = NULL;
9850 Py_ssize_t length, newlength = 0;
9851 int kind, outkind;
9852 void *data, *outdata;
9853 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9854
Benjamin Petersoneea48462012-01-16 14:28:50 -05009855 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009856
9857 kind = PyUnicode_KIND(self);
9858 data = PyUnicode_DATA(self);
9859 length = PyUnicode_GET_LENGTH(self);
9860 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9861 if (tmp == NULL)
9862 return PyErr_NoMemory();
9863 newlength = perform(kind, data, length, tmp, &maxchar);
9864 res = PyUnicode_New(newlength, maxchar);
9865 if (res == NULL)
9866 goto leave;
9867 tmpend = tmp + newlength;
9868 outdata = PyUnicode_DATA(res);
9869 outkind = PyUnicode_KIND(res);
9870 switch (outkind) {
9871 case PyUnicode_1BYTE_KIND:
9872 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9873 break;
9874 case PyUnicode_2BYTE_KIND:
9875 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9879 break;
9880 default:
9881 assert(0);
9882 break;
9883 }
9884 leave:
9885 PyMem_FREE(tmp);
9886 return res;
9887}
9888
Tim Peters8ce9f162004-08-27 01:49:32 +00009889PyObject *
9890PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009893 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009895 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009896 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9897 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009898 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009900 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009902 int use_memcpy;
9903 unsigned char *res_data = NULL, *sep_data = NULL;
9904 PyObject *last_obj;
9905 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906
Tim Peters05eba1f2004-08-27 21:32:02 +00009907 fseq = PySequence_Fast(seq, "");
9908 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009909 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009910 }
9911
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912 /* NOTE: the following code can't call back into Python code,
9913 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009914 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009915
Tim Peters05eba1f2004-08-27 21:32:02 +00009916 seqlen = PySequence_Fast_GET_SIZE(fseq);
9917 /* If empty sequence, return u"". */
9918 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009919 Py_DECREF(fseq);
9920 Py_INCREF(unicode_empty);
9921 res = unicode_empty;
9922 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009923 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009924
Tim Peters05eba1f2004-08-27 21:32:02 +00009925 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009926 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009927 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009928 if (seqlen == 1) {
9929 if (PyUnicode_CheckExact(items[0])) {
9930 res = items[0];
9931 Py_INCREF(res);
9932 Py_DECREF(fseq);
9933 return res;
9934 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009935 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009936 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009937 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009939 /* Set up sep and seplen */
9940 if (separator == NULL) {
9941 /* fall back to a blank space separator */
9942 sep = PyUnicode_FromOrdinal(' ');
9943 if (!sep)
9944 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009945 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009946 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009947 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009948 else {
9949 if (!PyUnicode_Check(separator)) {
9950 PyErr_Format(PyExc_TypeError,
9951 "separator: expected str instance,"
9952 " %.80s found",
9953 Py_TYPE(separator)->tp_name);
9954 goto onError;
9955 }
9956 if (PyUnicode_READY(separator))
9957 goto onError;
9958 sep = separator;
9959 seplen = PyUnicode_GET_LENGTH(separator);
9960 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9961 /* inc refcount to keep this code path symmetric with the
9962 above case of a blank separator */
9963 Py_INCREF(sep);
9964 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009965 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009966 }
9967
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009968 /* There are at least two things to join, or else we have a subclass
9969 * of str in the sequence.
9970 * Do a pre-pass to figure out the total amount of space we'll
9971 * need (sz), and see whether all argument are strings.
9972 */
9973 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009974#ifdef Py_DEBUG
9975 use_memcpy = 0;
9976#else
9977 use_memcpy = 1;
9978#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009979 for (i = 0; i < seqlen; i++) {
9980 const Py_ssize_t old_sz = sz;
9981 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009982 if (!PyUnicode_Check(item)) {
9983 PyErr_Format(PyExc_TypeError,
9984 "sequence item %zd: expected str instance,"
9985 " %.80s found",
9986 i, Py_TYPE(item)->tp_name);
9987 goto onError;
9988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 if (PyUnicode_READY(item) == -1)
9990 goto onError;
9991 sz += PyUnicode_GET_LENGTH(item);
9992 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009993 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009994 if (i != 0)
9995 sz += seplen;
9996 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9997 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009999 goto onError;
10000 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 if (use_memcpy && last_obj != NULL) {
10002 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10003 use_memcpy = 0;
10004 }
10005 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 }
Tim Petersced69f82003-09-16 20:30:58 +000010007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 if (res == NULL)
10010 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010011
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010012 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 if (use_memcpy) {
10017 res_data = PyUnicode_1BYTE_DATA(res);
10018 kind = PyUnicode_KIND(res);
10019 if (seplen != 0)
10020 sep_data = PyUnicode_1BYTE_DATA(sep);
10021 }
10022#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010024 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010025 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010027 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010028 if (use_memcpy) {
10029 Py_MEMCPY(res_data,
10030 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010031 kind * seplen);
10032 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010033 }
10034 else {
10035 copy_characters(res, res_offset, sep, 0, seplen);
10036 res_offset += seplen;
10037 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010039 itemlen = PyUnicode_GET_LENGTH(item);
10040 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010041 if (use_memcpy) {
10042 Py_MEMCPY(res_data,
10043 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010044 kind * itemlen);
10045 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 }
10047 else {
10048 copy_characters(res, res_offset, item, 0, itemlen);
10049 res_offset += itemlen;
10050 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010051 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010052 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 if (use_memcpy)
10054 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010055 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010056 else
10057 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010058
Tim Peters05eba1f2004-08-27 21:32:02 +000010059 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010061 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010063
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010065 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010067 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010068 return NULL;
10069}
10070
/* FILL(kind, data, value, start, length)
 *
 * Store `value` into `length` consecutive code units of the character
 * buffer `data`, beginning at code-unit index `start`.  `kind` selects the
 * width of one code unit (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
 * PyUnicode_4BYTE_KIND); any other kind trips an assertion.
 *
 * Every argument is fully parenthesized at each use so that expression
 * arguments (e.g. `cond ? a : b`) expand correctly.  Note that `value` and
 * `length` are still evaluated once per iteration in the 2- and 4-byte
 * cases, so callers must not pass expressions with side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10094
Victor Stinner3fe55312012-01-04 00:33:50 +010010095Py_ssize_t
10096PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10097 Py_UCS4 fill_char)
10098{
10099 Py_ssize_t maxlen;
10100 enum PyUnicode_Kind kind;
10101 void *data;
10102
10103 if (!PyUnicode_Check(unicode)) {
10104 PyErr_BadInternalCall();
10105 return -1;
10106 }
10107 if (PyUnicode_READY(unicode) == -1)
10108 return -1;
10109 if (unicode_check_modifiable(unicode))
10110 return -1;
10111
10112 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10113 PyErr_SetString(PyExc_ValueError,
10114 "fill character is bigger than "
10115 "the string maximum character");
10116 return -1;
10117 }
10118
10119 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10120 length = Py_MIN(maxlen, length);
10121 if (length <= 0)
10122 return 0;
10123
10124 kind = PyUnicode_KIND(unicode);
10125 data = PyUnicode_DATA(unicode);
10126 FILL(kind, data, fill_char, start, length);
10127 return length;
10128}
10129
Victor Stinner9310abb2011-10-05 00:59:23 +020010130static PyObject *
10131pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010132 Py_ssize_t left,
10133 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010136 PyObject *u;
10137 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010138 int kind;
10139 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
10141 if (left < 0)
10142 left = 0;
10143 if (right < 0)
10144 right = 0;
10145
Victor Stinnerc4b49542011-12-11 22:44:26 +010010146 if (left == 0 && right == 0)
10147 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10150 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010151 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10152 return NULL;
10153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +020010155 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010157 if (!u)
10158 return NULL;
10159
10160 kind = PyUnicode_KIND(u);
10161 data = PyUnicode_DATA(u);
10162 if (left)
10163 FILL(kind, data, fill, 0, left);
10164 if (right)
10165 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010166 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010167 assert(_PyUnicode_CheckConsistency(u, 1));
10168 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169}
10170
Alexander Belopolsky40018472011-02-26 01:02:56 +000010171PyObject *
10172PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010173{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175
10176 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010177 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010178 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010179 if (PyUnicode_READY(string) == -1) {
10180 Py_DECREF(string);
10181 return NULL;
10182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183
Benjamin Petersonead6b532011-12-20 17:23:42 -060010184 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 if (PyUnicode_IS_ASCII(string))
10187 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 PyUnicode_GET_LENGTH(string), keepends);
10190 else
10191 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 break;
10195 case PyUnicode_2BYTE_KIND:
10196 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010197 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 PyUnicode_GET_LENGTH(string), keepends);
10199 break;
10200 case PyUnicode_4BYTE_KIND:
10201 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010202 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyUnicode_GET_LENGTH(string), keepends);
10204 break;
10205 default:
10206 assert(0);
10207 list = 0;
10208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209 Py_DECREF(string);
10210 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211}
10212
Alexander Belopolsky40018472011-02-26 01:02:56 +000010213static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010214split(PyObject *self,
10215 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010216 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 int kind1, kind2, kind;
10219 void *buf1, *buf2;
10220 Py_ssize_t len1, len2;
10221 PyObject* out;
10222
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010224 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (PyUnicode_READY(self) == -1)
10227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010230 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010232 if (PyUnicode_IS_ASCII(self))
10233 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010235 PyUnicode_GET_LENGTH(self), maxcount
10236 );
10237 else
10238 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010239 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010240 PyUnicode_GET_LENGTH(self), maxcount
10241 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 case PyUnicode_2BYTE_KIND:
10243 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010244 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 PyUnicode_GET_LENGTH(self), maxcount
10246 );
10247 case PyUnicode_4BYTE_KIND:
10248 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010249 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 PyUnicode_GET_LENGTH(self), maxcount
10251 );
10252 default:
10253 assert(0);
10254 return NULL;
10255 }
10256
10257 if (PyUnicode_READY(substring) == -1)
10258 return NULL;
10259
10260 kind1 = PyUnicode_KIND(self);
10261 kind2 = PyUnicode_KIND(substring);
10262 kind = kind1 > kind2 ? kind1 : kind2;
10263 buf1 = PyUnicode_DATA(self);
10264 buf2 = PyUnicode_DATA(substring);
10265 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 if (!buf1)
10268 return NULL;
10269 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010270 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 if (!buf2) {
10272 if (kind1 != kind) PyMem_Free(buf1);
10273 return NULL;
10274 }
10275 len1 = PyUnicode_GET_LENGTH(self);
10276 len2 = PyUnicode_GET_LENGTH(substring);
10277
Benjamin Petersonead6b532011-12-20 17:23:42 -060010278 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10281 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010283 else
10284 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010285 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 break;
10287 case PyUnicode_2BYTE_KIND:
10288 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010289 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 break;
10291 case PyUnicode_4BYTE_KIND:
10292 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010293 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 break;
10295 default:
10296 out = NULL;
10297 }
10298 if (kind1 != kind)
10299 PyMem_Free(buf1);
10300 if (kind2 != kind)
10301 PyMem_Free(buf2);
10302 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303}
10304
Alexander Belopolsky40018472011-02-26 01:02:56 +000010305static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010306rsplit(PyObject *self,
10307 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010308 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 int kind1, kind2, kind;
10311 void *buf1, *buf2;
10312 Py_ssize_t len1, len2;
10313 PyObject* out;
10314
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010315 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010316 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (PyUnicode_READY(self) == -1)
10319 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010322 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 if (PyUnicode_IS_ASCII(self))
10325 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010326 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010327 PyUnicode_GET_LENGTH(self), maxcount
10328 );
10329 else
10330 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010331 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010332 PyUnicode_GET_LENGTH(self), maxcount
10333 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 case PyUnicode_2BYTE_KIND:
10335 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 PyUnicode_GET_LENGTH(self), maxcount
10338 );
10339 case PyUnicode_4BYTE_KIND:
10340 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(self), maxcount
10343 );
10344 default:
10345 assert(0);
10346 return NULL;
10347 }
10348
10349 if (PyUnicode_READY(substring) == -1)
10350 return NULL;
10351
10352 kind1 = PyUnicode_KIND(self);
10353 kind2 = PyUnicode_KIND(substring);
10354 kind = kind1 > kind2 ? kind1 : kind2;
10355 buf1 = PyUnicode_DATA(self);
10356 buf2 = PyUnicode_DATA(substring);
10357 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 if (!buf1)
10360 return NULL;
10361 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010362 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 if (!buf2) {
10364 if (kind1 != kind) PyMem_Free(buf1);
10365 return NULL;
10366 }
10367 len1 = PyUnicode_GET_LENGTH(self);
10368 len2 = PyUnicode_GET_LENGTH(substring);
10369
Benjamin Petersonead6b532011-12-20 17:23:42 -060010370 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10373 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 else
10376 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010377 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 break;
10379 case PyUnicode_2BYTE_KIND:
10380 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 break;
10383 case PyUnicode_4BYTE_KIND:
10384 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010385 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 break;
10387 default:
10388 out = NULL;
10389 }
10390 if (kind1 != kind)
10391 PyMem_Free(buf1);
10392 if (kind2 != kind)
10393 PyMem_Free(buf2);
10394 return out;
10395}
10396
10397static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10399 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010401 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010403 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10404 return asciilib_find(buf1, len1, buf2, len2, offset);
10405 else
10406 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 case PyUnicode_2BYTE_KIND:
10408 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10409 case PyUnicode_4BYTE_KIND:
10410 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10411 }
10412 assert(0);
10413 return -1;
10414}
10415
10416static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010417anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10418 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010420 switch (kind) {
10421 case PyUnicode_1BYTE_KIND:
10422 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10423 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10424 else
10425 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10426 case PyUnicode_2BYTE_KIND:
10427 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10428 case PyUnicode_4BYTE_KIND:
10429 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10430 }
10431 assert(0);
10432 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010433}
10434
Alexander Belopolsky40018472011-02-26 01:02:56 +000010435static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436replace(PyObject *self, PyObject *str1,
10437 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010438{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 PyObject *u;
10440 char *sbuf = PyUnicode_DATA(self);
10441 char *buf1 = PyUnicode_DATA(str1);
10442 char *buf2 = PyUnicode_DATA(str2);
10443 int srelease = 0, release1 = 0, release2 = 0;
10444 int skind = PyUnicode_KIND(self);
10445 int kind1 = PyUnicode_KIND(str1);
10446 int kind2 = PyUnicode_KIND(str2);
10447 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10448 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10449 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010450 int mayshrink;
10451 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452
10453 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010456 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457
Victor Stinner59de0ee2011-10-07 10:01:28 +020010458 if (str1 == str2)
10459 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (skind < kind1)
10461 /* substring too wide to be present */
10462 goto nothing;
10463
Victor Stinner49a0a212011-10-12 23:46:10 +020010464 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10465 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10466 /* Replacing str1 with str2 may cause a maxchar reduction in the
10467 result string. */
10468 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010469 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010473 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010474 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010477 Py_UCS4 u1, u2;
10478 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010479 Py_ssize_t index, pos;
10480 char *src;
10481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010483 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10484 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010490 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010492
10493 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10494 index = 0;
10495 src = sbuf;
10496 while (--maxcount)
10497 {
10498 pos++;
10499 src += pos * PyUnicode_KIND(self);
10500 slen -= pos;
10501 index += pos;
10502 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10503 if (pos < 0)
10504 break;
10505 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10506 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 }
10508 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 int rkind = skind;
10510 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010511 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 if (kind1 < rkind) {
10514 /* widen substring */
10515 buf1 = _PyUnicode_AsKind(str1, rkind);
10516 if (!buf1) goto error;
10517 release1 = 1;
10518 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010520 if (i < 0)
10521 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (rkind > kind2) {
10523 /* widen replacement */
10524 buf2 = _PyUnicode_AsKind(str2, rkind);
10525 if (!buf2) goto error;
10526 release2 = 1;
10527 }
10528 else if (rkind < kind2) {
10529 /* widen self and buf1 */
10530 rkind = kind2;
10531 if (release1) PyMem_Free(buf1);
10532 sbuf = _PyUnicode_AsKind(self, rkind);
10533 if (!sbuf) goto error;
10534 srelease = 1;
10535 buf1 = _PyUnicode_AsKind(str1, rkind);
10536 if (!buf1) goto error;
10537 release1 = 1;
10538 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010539 u = PyUnicode_New(slen, maxchar);
10540 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010541 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010542 assert(PyUnicode_KIND(u) == rkind);
10543 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010544
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010545 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010546 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010547 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010549 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010551
10552 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010553 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010554 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010556 if (i == -1)
10557 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010558 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010560 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010564 }
10565 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 Py_ssize_t n, i, j, ires;
10567 Py_ssize_t product, new_size;
10568 int rkind = skind;
10569 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 buf1 = _PyUnicode_AsKind(str1, rkind);
10574 if (!buf1) goto error;
10575 release1 = 1;
10576 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010577 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010578 if (n == 0)
10579 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010581 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 buf2 = _PyUnicode_AsKind(str2, rkind);
10583 if (!buf2) goto error;
10584 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010587 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 rkind = kind2;
10589 sbuf = _PyUnicode_AsKind(self, rkind);
10590 if (!sbuf) goto error;
10591 srelease = 1;
10592 if (release1) PyMem_Free(buf1);
10593 buf1 = _PyUnicode_AsKind(str1, rkind);
10594 if (!buf1) goto error;
10595 release1 = 1;
10596 }
10597 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10598 PyUnicode_GET_LENGTH(str1))); */
10599 product = n * (len2-len1);
10600 if ((product / (len2-len1)) != n) {
10601 PyErr_SetString(PyExc_OverflowError,
10602 "replace string is too long");
10603 goto error;
10604 }
10605 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 if (new_size == 0) {
10607 Py_INCREF(unicode_empty);
10608 u = unicode_empty;
10609 goto done;
10610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10612 PyErr_SetString(PyExc_OverflowError,
10613 "replace string is too long");
10614 goto error;
10615 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010616 u = PyUnicode_New(new_size, maxchar);
10617 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010619 assert(PyUnicode_KIND(u) == rkind);
10620 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 ires = i = 0;
10622 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010623 while (n-- > 0) {
10624 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010625 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010626 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010627 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010628 if (j == -1)
10629 break;
10630 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010631 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010632 memcpy(res + rkind * ires,
10633 sbuf + rkind * i,
10634 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010636 }
10637 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010639 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010641 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010647 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010648 memcpy(res + rkind * ires,
10649 sbuf + rkind * i,
10650 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 }
10652 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 /* interleave */
10654 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010655 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010657 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 if (--n <= 0)
10660 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010661 memcpy(res + rkind * ires,
10662 sbuf + rkind * i,
10663 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 ires++;
10665 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010666 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
10668 sbuf + rkind * i,
10669 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010671 }
10672
10673 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010674 unicode_adjust_maxchar(&u);
10675 if (u == NULL)
10676 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010678
10679 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (srelease)
10681 PyMem_FREE(sbuf);
10682 if (release1)
10683 PyMem_FREE(buf1);
10684 if (release2)
10685 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010686 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010688
Benjamin Peterson29060642009-01-31 22:14:21 +000010689 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010690 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (srelease)
10692 PyMem_FREE(sbuf);
10693 if (release1)
10694 PyMem_FREE(buf1);
10695 if (release2)
10696 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010697 return unicode_result_unchanged(self);
10698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 error:
10700 if (srelease && sbuf)
10701 PyMem_FREE(sbuf);
10702 if (release1 && buf1)
10703 PyMem_FREE(buf1);
10704 if (release2 && buf2)
10705 PyMem_FREE(buf2);
10706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707}
10708
10709/* --- Unicode Object Methods --------------------------------------------- */
10710
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010711PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010712 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713\n\
10714Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010715characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
10717static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010718unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010720 if (PyUnicode_READY(self) == -1)
10721 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010722 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723}
10724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727\n\
10728Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010729have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
10731static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010732unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010734 if (PyUnicode_READY(self) == -1)
10735 return NULL;
10736 if (PyUnicode_GET_LENGTH(self) == 0)
10737 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010738 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739}
10740
Benjamin Petersond5890c82012-01-14 13:23:30 -050010741PyDoc_STRVAR(casefold__doc__,
10742 "S.casefold() -> str\n\
10743\n\
10744Return a version of S suitable for caseless comparisons.");
10745
10746static PyObject *
10747unicode_casefold(PyObject *self)
10748{
10749 if (PyUnicode_READY(self) == -1)
10750 return NULL;
10751 if (PyUnicode_IS_ASCII(self))
10752 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010753 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010754}
10755
10756
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010757/* Argument converter. Coerces to a single unicode character */
10758
10759static int
10760convert_uc(PyObject *obj, void *addr)
10761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010763 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010764
Benjamin Peterson14339b62009-01-31 16:36:08 +000010765 uniobj = PyUnicode_FromObject(obj);
10766 if (uniobj == NULL) {
10767 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010768 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010769 return 0;
10770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010772 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010774 Py_DECREF(uniobj);
10775 return 0;
10776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010778 Py_DECREF(uniobj);
10779 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010780}
10781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010782PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010785Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010786done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787
10788static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010789unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010790{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010791 Py_ssize_t marg, left;
10792 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 Py_UCS4 fillchar = ' ';
10794
Victor Stinnere9a29352011-10-01 02:14:59 +020010795 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
Benjamin Petersonbac79492012-01-14 13:34:47 -050010798 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010799 return NULL;
10800
Victor Stinnerc4b49542011-12-11 22:44:26 +010010801 if (PyUnicode_GET_LENGTH(self) >= width)
10802 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803
Victor Stinnerc4b49542011-12-11 22:44:26 +010010804 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805 left = marg / 2 + (marg & width & 1);
10806
Victor Stinner9310abb2011-10-05 00:59:23 +020010807 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808}
10809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810/* This function assumes that str1 and str2 are readied by the caller. */
10811
Marc-André Lemburge5034372000-08-08 08:04:29 +000010812static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010813unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010815 int kind1, kind2;
10816 void *data1, *data2;
10817 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 kind1 = PyUnicode_KIND(str1);
10820 kind2 = PyUnicode_KIND(str2);
10821 data1 = PyUnicode_DATA(str1);
10822 data2 = PyUnicode_DATA(str2);
10823 len1 = PyUnicode_GET_LENGTH(str1);
10824 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826 for (i = 0; i < len1 && i < len2; ++i) {
10827 Py_UCS4 c1, c2;
10828 c1 = PyUnicode_READ(kind1, data1, i);
10829 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010830
10831 if (c1 != c2)
10832 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010833 }
10834
10835 return (len1 < len2) ? -1 : (len1 != len2);
10836}
10837
Alexander Belopolsky40018472011-02-26 01:02:56 +000010838int
10839PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10842 if (PyUnicode_READY(left) == -1 ||
10843 PyUnicode_READY(right) == -1)
10844 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010847 PyErr_Format(PyExc_TypeError,
10848 "Can't compare %.100s and %.100s",
10849 left->ob_type->tp_name,
10850 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 return -1;
10852}
10853
Martin v. Löwis5b222132007-06-10 09:51:05 +000010854int
10855PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857 Py_ssize_t i;
10858 int kind;
10859 void *data;
10860 Py_UCS4 chr;
10861
Victor Stinner910337b2011-10-03 03:20:16 +020010862 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 if (PyUnicode_READY(uni) == -1)
10864 return -1;
10865 kind = PyUnicode_KIND(uni);
10866 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010867 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010868 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10869 if (chr != str[i])
10870 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010871 /* This check keeps Python strings that end in '\0' from comparing equal
10872 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010874 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010875 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010877 return 0;
10878}
10879
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010880
Benjamin Peterson29060642009-01-31 22:14:21 +000010881#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010882 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010883
Alexander Belopolsky40018472011-02-26 01:02:56 +000010884PyObject *
10885PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010886{
10887 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010888
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010889 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10890 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (PyUnicode_READY(left) == -1 ||
10892 PyUnicode_READY(right) == -1)
10893 return NULL;
10894 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10895 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010896 if (op == Py_EQ) {
10897 Py_INCREF(Py_False);
10898 return Py_False;
10899 }
10900 if (op == Py_NE) {
10901 Py_INCREF(Py_True);
10902 return Py_True;
10903 }
10904 }
10905 if (left == right)
10906 result = 0;
10907 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010908 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010909
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010910 /* Convert the return value to a Boolean */
10911 switch (op) {
10912 case Py_EQ:
10913 v = TEST_COND(result == 0);
10914 break;
10915 case Py_NE:
10916 v = TEST_COND(result != 0);
10917 break;
10918 case Py_LE:
10919 v = TEST_COND(result <= 0);
10920 break;
10921 case Py_GE:
10922 v = TEST_COND(result >= 0);
10923 break;
10924 case Py_LT:
10925 v = TEST_COND(result == -1);
10926 break;
10927 case Py_GT:
10928 v = TEST_COND(result == 1);
10929 break;
10930 default:
10931 PyErr_BadArgument();
10932 return NULL;
10933 }
10934 Py_INCREF(v);
10935 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010936 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010937
Brian Curtindfc80e32011-08-10 20:28:54 -050010938 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010939}
10940
Alexander Belopolsky40018472011-02-26 01:02:56 +000010941int
10942PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010943{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010944 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945 int kind1, kind2, kind;
10946 void *buf1, *buf2;
10947 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010948 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010949
10950 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010951 sub = PyUnicode_FromObject(element);
10952 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010953 PyErr_Format(PyExc_TypeError,
10954 "'in <string>' requires string as left operand, not %s",
10955 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010956 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010957 }
10958
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010960 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010961 Py_DECREF(sub);
10962 return -1;
10963 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010964 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10965 Py_DECREF(sub);
10966 Py_DECREF(str);
10967 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 kind1 = PyUnicode_KIND(str);
10970 kind2 = PyUnicode_KIND(sub);
10971 kind = kind1 > kind2 ? kind1 : kind2;
10972 buf1 = PyUnicode_DATA(str);
10973 buf2 = PyUnicode_DATA(sub);
10974 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010975 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 if (!buf1) {
10977 Py_DECREF(sub);
10978 return -1;
10979 }
10980 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010981 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (!buf2) {
10983 Py_DECREF(sub);
10984 if (kind1 != kind) PyMem_Free(buf1);
10985 return -1;
10986 }
10987 len1 = PyUnicode_GET_LENGTH(str);
10988 len2 = PyUnicode_GET_LENGTH(sub);
10989
Benjamin Petersonead6b532011-12-20 17:23:42 -060010990 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 case PyUnicode_1BYTE_KIND:
10992 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10993 break;
10994 case PyUnicode_2BYTE_KIND:
10995 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10996 break;
10997 case PyUnicode_4BYTE_KIND:
10998 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10999 break;
11000 default:
11001 result = -1;
11002 assert(0);
11003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011004
11005 Py_DECREF(str);
11006 Py_DECREF(sub);
11007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (kind1 != kind)
11009 PyMem_Free(buf1);
11010 if (kind2 != kind)
11011 PyMem_Free(buf2);
11012
Guido van Rossum403d68b2000-03-13 15:55:09 +000011013 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011014}
11015
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016/* Concat to string or Unicode object giving a new Unicode object. */
11017
Alexander Belopolsky40018472011-02-26 01:02:56 +000011018PyObject *
11019PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011022 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011023 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024
11025 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032
11033 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011034 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011038 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011039 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041 }
11042
Victor Stinner488fa492011-12-12 00:01:39 +010011043 u_len = PyUnicode_GET_LENGTH(u);
11044 v_len = PyUnicode_GET_LENGTH(v);
11045 if (u_len > PY_SSIZE_T_MAX - v_len) {
11046 PyErr_SetString(PyExc_OverflowError,
11047 "strings are too large to concat");
11048 goto onError;
11049 }
11050 new_len = u_len + v_len;
11051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011053 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020011054 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011057 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011059 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011060 copy_characters(w, 0, u, 0, u_len);
11061 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 Py_DECREF(u);
11063 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011064 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066
Benjamin Peterson29060642009-01-31 22:14:21 +000011067 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 Py_XDECREF(u);
11069 Py_XDECREF(v);
11070 return NULL;
11071}
11072
Walter Dörwald1ab83302007-05-18 17:15:44 +000011073void
Victor Stinner23e56682011-10-03 03:54:37 +020011074PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011075{
Victor Stinner23e56682011-10-03 03:54:37 +020011076 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011077 Py_UCS4 maxchar, maxchar2;
11078 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011079
11080 if (p_left == NULL) {
11081 if (!PyErr_Occurred())
11082 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011083 return;
11084 }
Victor Stinner23e56682011-10-03 03:54:37 +020011085 left = *p_left;
11086 if (right == NULL || !PyUnicode_Check(left)) {
11087 if (!PyErr_Occurred())
11088 PyErr_BadInternalCall();
11089 goto error;
11090 }
11091
Benjamin Petersonbac79492012-01-14 13:34:47 -050011092 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011093 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011094 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011095 goto error;
11096
Victor Stinner488fa492011-12-12 00:01:39 +010011097 /* Shortcuts */
11098 if (left == unicode_empty) {
11099 Py_DECREF(left);
11100 Py_INCREF(right);
11101 *p_left = right;
11102 return;
11103 }
11104 if (right == unicode_empty)
11105 return;
11106
11107 left_len = PyUnicode_GET_LENGTH(left);
11108 right_len = PyUnicode_GET_LENGTH(right);
11109 if (left_len > PY_SSIZE_T_MAX - right_len) {
11110 PyErr_SetString(PyExc_OverflowError,
11111 "strings are too large to concat");
11112 goto error;
11113 }
11114 new_len = left_len + right_len;
11115
11116 if (unicode_modifiable(left)
11117 && PyUnicode_CheckExact(right)
11118 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011119 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11120 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011121 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011122 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011123 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11124 {
11125 /* append inplace */
11126 if (unicode_resize(p_left, new_len) != 0) {
11127 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11128 * deallocated so it cannot be put back into
11129 * 'variable'. The MemoryError is raised when there
11130 * is no value in 'variable', which might (very
11131 * remotely) be a cause of incompatibilities.
11132 */
11133 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011134 }
Victor Stinner488fa492011-12-12 00:01:39 +010011135 /* copy 'right' into the newly allocated area of 'left' */
11136 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011137 }
Victor Stinner488fa492011-12-12 00:01:39 +010011138 else {
11139 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11140 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020011141 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011142
Victor Stinner488fa492011-12-12 00:01:39 +010011143 /* Concat the two Unicode strings */
11144 res = PyUnicode_New(new_len, maxchar);
11145 if (res == NULL)
11146 goto error;
11147 copy_characters(res, 0, left, 0, left_len);
11148 copy_characters(res, left_len, right, 0, right_len);
11149 Py_DECREF(left);
11150 *p_left = res;
11151 }
11152 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011153 return;
11154
11155error:
Victor Stinner488fa492011-12-12 00:01:39 +010011156 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011157}
11158
11159void
11160PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11161{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011162 PyUnicode_Append(pleft, right);
11163 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011169Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011170string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011171interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172
11173static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011174unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011176 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011177 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011178 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011179 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 int kind1, kind2, kind;
11181 void *buf1, *buf2;
11182 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183
Jesus Ceaac451502011-04-20 17:09:23 +020011184 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11185 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 kind1 = PyUnicode_KIND(self);
11189 kind2 = PyUnicode_KIND(substring);
11190 kind = kind1 > kind2 ? kind1 : kind2;
11191 buf1 = PyUnicode_DATA(self);
11192 buf2 = PyUnicode_DATA(substring);
11193 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011194 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 if (!buf1) {
11196 Py_DECREF(substring);
11197 return NULL;
11198 }
11199 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011200 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (!buf2) {
11202 Py_DECREF(substring);
11203 if (kind1 != kind) PyMem_Free(buf1);
11204 return NULL;
11205 }
11206 len1 = PyUnicode_GET_LENGTH(self);
11207 len2 = PyUnicode_GET_LENGTH(substring);
11208
11209 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011210 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 case PyUnicode_1BYTE_KIND:
11212 iresult = ucs1lib_count(
11213 ((Py_UCS1*)buf1) + start, end - start,
11214 buf2, len2, PY_SSIZE_T_MAX
11215 );
11216 break;
11217 case PyUnicode_2BYTE_KIND:
11218 iresult = ucs2lib_count(
11219 ((Py_UCS2*)buf1) + start, end - start,
11220 buf2, len2, PY_SSIZE_T_MAX
11221 );
11222 break;
11223 case PyUnicode_4BYTE_KIND:
11224 iresult = ucs4lib_count(
11225 ((Py_UCS4*)buf1) + start, end - start,
11226 buf2, len2, PY_SSIZE_T_MAX
11227 );
11228 break;
11229 default:
11230 assert(0); iresult = 0;
11231 }
11232
11233 result = PyLong_FromSsize_t(iresult);
11234
11235 if (kind1 != kind)
11236 PyMem_Free(buf1);
11237 if (kind2 != kind)
11238 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239
11240 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011241
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242 return result;
11243}
11244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011245PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011246 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011248Encode S using the codec registered for encoding. Default encoding\n\
11249is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011250handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011251a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11252'xmlcharrefreplace' as well as any other name registered with\n\
11253codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
11255static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011256unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011258 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 char *encoding = NULL;
11260 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011261
Benjamin Peterson308d6372009-09-18 21:42:35 +000011262 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11263 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011265 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011266}
11267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011268PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270\n\
11271Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
11274static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011275unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 Py_ssize_t i, j, line_pos, src_len, incr;
11278 Py_UCS4 ch;
11279 PyObject *u;
11280 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011283 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
11285 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
Antoine Pitrou22425222011-10-04 19:10:51 +020011288 if (PyUnicode_READY(self) == -1)
11289 return NULL;
11290
Thomas Wouters7e474022000-07-16 12:04:32 +000011291 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011292 src_len = PyUnicode_GET_LENGTH(self);
11293 i = j = line_pos = 0;
11294 kind = PyUnicode_KIND(self);
11295 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011296 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 for (; i < src_len; i++) {
11298 ch = PyUnicode_READ(kind, src_data, i);
11299 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011300 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011302 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 goto overflow;
11305 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011307 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 goto overflow;
11312 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011314 if (ch == '\n' || ch == '\r')
11315 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011317 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011318 if (!found)
11319 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011320
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011322 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323 if (!u)
11324 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011325 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Antoine Pitroue71d5742011-10-04 15:55:09 +020011327 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328
Antoine Pitroue71d5742011-10-04 15:55:09 +020011329 for (; i < src_len; i++) {
11330 ch = PyUnicode_READ(kind, src_data, i);
11331 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011333 incr = tabsize - (line_pos % tabsize);
11334 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011335 FILL(kind, dest_data, ' ', j, incr);
11336 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011338 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011340 line_pos++;
11341 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011342 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011343 if (ch == '\n' || ch == '\r')
11344 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011346 }
11347 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011348 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011349
Antoine Pitroue71d5742011-10-04 15:55:09 +020011350 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011351 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11352 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353}
11354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011355PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011356 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357\n\
11358Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011359such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360arguments start and end are interpreted as in slice notation.\n\
11361\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011362Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
11364static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011367 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011368 Py_ssize_t start;
11369 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011370 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
Jesus Ceaac451502011-04-20 17:09:23 +020011372 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11373 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (PyUnicode_READY(self) == -1)
11377 return NULL;
11378 if (PyUnicode_READY(substring) == -1)
11379 return NULL;
11380
Victor Stinner7931d9a2011-11-04 00:22:48 +010011381 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382
11383 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011384
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 if (result == -2)
11386 return NULL;
11387
Christian Heimes217cfd12007-12-02 14:31:20 +000011388 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389}
11390
11391static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011392unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011394 void *data;
11395 enum PyUnicode_Kind kind;
11396 Py_UCS4 ch;
11397 PyObject *res;
11398
11399 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11400 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011402 }
11403 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11404 PyErr_SetString(PyExc_IndexError, "string index out of range");
11405 return NULL;
11406 }
11407 kind = PyUnicode_KIND(self);
11408 data = PyUnicode_DATA(self);
11409 ch = PyUnicode_READ(kind, data, index);
11410 if (ch < 256)
11411 return get_latin1_char(ch);
11412
11413 res = PyUnicode_New(1, ch);
11414 if (res == NULL)
11415 return NULL;
11416 kind = PyUnicode_KIND(res);
11417 data = PyUnicode_DATA(res);
11418 PyUnicode_WRITE(kind, data, 0, ch);
11419 assert(_PyUnicode_CheckConsistency(res, 1));
11420 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421}
11422
Guido van Rossumc2504932007-09-18 19:42:40 +000011423/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011424 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011425static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Guido van Rossumc2504932007-09-18 19:42:40 +000011428 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011429 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011430
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011431#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011432 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011433#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (_PyUnicode_HASH(self) != -1)
11435 return _PyUnicode_HASH(self);
11436 if (PyUnicode_READY(self) == -1)
11437 return -1;
11438 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011439 /*
11440 We make the hash of the empty string be 0, rather than using
11441 (prefix ^ suffix), since this slightly obfuscates the hash secret
11442 */
11443 if (len == 0) {
11444 _PyUnicode_HASH(self) = 0;
11445 return 0;
11446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447
11448 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011449#define HASH(P) \
11450 x ^= (Py_uhash_t) *P << 7; \
11451 while (--len >= 0) \
11452 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453
Georg Brandl2fb477c2012-02-21 00:33:36 +010011454 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 switch (PyUnicode_KIND(self)) {
11456 case PyUnicode_1BYTE_KIND: {
11457 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11458 HASH(c);
11459 break;
11460 }
11461 case PyUnicode_2BYTE_KIND: {
11462 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11463 HASH(s);
11464 break;
11465 }
11466 default: {
11467 Py_UCS4 *l;
11468 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11469 "Impossible switch case in unicode_hash");
11470 l = PyUnicode_4BYTE_DATA(self);
11471 HASH(l);
11472 break;
11473 }
11474 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011475 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11476 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477
Guido van Rossumc2504932007-09-18 19:42:40 +000011478 if (x == -1)
11479 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011481 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011485PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011486 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
11490static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011493 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011494 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011495 Py_ssize_t start;
11496 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
Jesus Ceaac451502011-04-20 17:09:23 +020011498 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11499 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (PyUnicode_READY(self) == -1)
11503 return NULL;
11504 if (PyUnicode_READY(substring) == -1)
11505 return NULL;
11506
Victor Stinner7931d9a2011-11-04 00:22:48 +010011507 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 if (result == -2)
11512 return NULL;
11513
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514 if (result < 0) {
11515 PyErr_SetString(PyExc_ValueError, "substring not found");
11516 return NULL;
11517 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011518
Christian Heimes217cfd12007-12-02 14:31:20 +000011519 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520}
11521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011522PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011523 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011525Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011526at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
11528static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011529unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 Py_ssize_t i, length;
11532 int kind;
11533 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 int cased;
11535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 if (PyUnicode_READY(self) == -1)
11537 return NULL;
11538 length = PyUnicode_GET_LENGTH(self);
11539 kind = PyUnicode_KIND(self);
11540 data = PyUnicode_DATA(self);
11541
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 if (length == 1)
11544 return PyBool_FromLong(
11545 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011547 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011550
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 for (i = 0; i < length; i++) {
11553 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011554
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11556 return PyBool_FromLong(0);
11557 else if (!cased && Py_UNICODE_ISLOWER(ch))
11558 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011560 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561}
11562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011563PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011566Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011567at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568
11569static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011570unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 Py_ssize_t i, length;
11573 int kind;
11574 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 int cased;
11576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011577 if (PyUnicode_READY(self) == -1)
11578 return NULL;
11579 length = PyUnicode_GET_LENGTH(self);
11580 kind = PyUnicode_KIND(self);
11581 data = PyUnicode_DATA(self);
11582
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (length == 1)
11585 return PyBool_FromLong(
11586 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011588 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011589 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011591
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593 for (i = 0; i < length; i++) {
11594 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011595
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11597 return PyBool_FromLong(0);
11598 else if (!cased && Py_UNICODE_ISUPPER(ch))
11599 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011601 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602}
11603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011607Return True if S is a titlecased string and there is at least one\n\
11608character in S, i.e. upper- and titlecase characters may only\n\
11609follow uncased characters and lowercase characters only cased ones.\n\
11610Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611
11612static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011613unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 Py_ssize_t i, length;
11616 int kind;
11617 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618 int cased, previous_is_cased;
11619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 if (PyUnicode_READY(self) == -1)
11621 return NULL;
11622 length = PyUnicode_GET_LENGTH(self);
11623 kind = PyUnicode_KIND(self);
11624 data = PyUnicode_DATA(self);
11625
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (length == 1) {
11628 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11629 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11630 (Py_UNICODE_ISUPPER(ch) != 0));
11631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011633 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011636
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637 cased = 0;
11638 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 for (i = 0; i < length; i++) {
11640 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011641
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11643 if (previous_is_cased)
11644 return PyBool_FromLong(0);
11645 previous_is_cased = 1;
11646 cased = 1;
11647 }
11648 else if (Py_UNICODE_ISLOWER(ch)) {
11649 if (!previous_is_cased)
11650 return PyBool_FromLong(0);
11651 previous_is_cased = 1;
11652 cased = 1;
11653 }
11654 else
11655 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011657 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011660PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011661 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011663Return True if all characters in S are whitespace\n\
11664and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
11666static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011667unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 Py_ssize_t i, length;
11670 int kind;
11671 void *data;
11672
11673 if (PyUnicode_READY(self) == -1)
11674 return NULL;
11675 length = PyUnicode_GET_LENGTH(self);
11676 kind = PyUnicode_KIND(self);
11677 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 if (length == 1)
11681 return PyBool_FromLong(
11682 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011684 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011686 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 for (i = 0; i < length; i++) {
11689 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011690 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011693 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011696PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011698\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011699Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011700and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011701
11702static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011703unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 Py_ssize_t i, length;
11706 int kind;
11707 void *data;
11708
11709 if (PyUnicode_READY(self) == -1)
11710 return NULL;
11711 length = PyUnicode_GET_LENGTH(self);
11712 kind = PyUnicode_KIND(self);
11713 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011714
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011715 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 if (length == 1)
11717 return PyBool_FromLong(
11718 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011719
11720 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011724 for (i = 0; i < length; i++) {
11725 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011727 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011728 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011729}
11730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011731PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011732 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011733\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011734Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011735and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011736
11737static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011738unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740 int kind;
11741 void *data;
11742 Py_ssize_t len, i;
11743
11744 if (PyUnicode_READY(self) == -1)
11745 return NULL;
11746
11747 kind = PyUnicode_KIND(self);
11748 data = PyUnicode_DATA(self);
11749 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011750
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011751 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 if (len == 1) {
11753 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11754 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11755 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011756
11757 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011759 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 for (i = 0; i < len; i++) {
11762 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011763 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011764 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011765 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011766 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011767}
11768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011772Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
11775static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011776unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 Py_ssize_t i, length;
11779 int kind;
11780 void *data;
11781
11782 if (PyUnicode_READY(self) == -1)
11783 return NULL;
11784 length = PyUnicode_GET_LENGTH(self);
11785 kind = PyUnicode_KIND(self);
11786 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 if (length == 1)
11790 return PyBool_FromLong(
11791 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011793 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 for (i = 0; i < length; i++) {
11798 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011801 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802}
11803
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011804PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011805 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011807Return True if all characters in S are digits\n\
11808and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809
11810static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011811unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 Py_ssize_t i, length;
11814 int kind;
11815 void *data;
11816
11817 if (PyUnicode_READY(self) == -1)
11818 return NULL;
11819 length = PyUnicode_GET_LENGTH(self);
11820 kind = PyUnicode_KIND(self);
11821 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (length == 1) {
11825 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11826 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11827 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011829 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 for (i = 0; i < length; i++) {
11834 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011837 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838}
11839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011840PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011843Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011844False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845
11846static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011847unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 Py_ssize_t i, length;
11850 int kind;
11851 void *data;
11852
11853 if (PyUnicode_READY(self) == -1)
11854 return NULL;
11855 length = PyUnicode_GET_LENGTH(self);
11856 kind = PyUnicode_KIND(self);
11857 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (length == 1)
11861 return PyBool_FromLong(
11862 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011864 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 for (i = 0; i < length; i++) {
11869 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011872 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873}
11874
Martin v. Löwis47383402007-08-15 07:32:56 +000011875int
11876PyUnicode_IsIdentifier(PyObject *self)
11877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 int kind;
11879 void *data;
11880 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011881 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 if (PyUnicode_READY(self) == -1) {
11884 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011885 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011886 }
11887
11888 /* Special case for empty strings */
11889 if (PyUnicode_GET_LENGTH(self) == 0)
11890 return 0;
11891 kind = PyUnicode_KIND(self);
11892 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011893
11894 /* PEP 3131 says that the first character must be in
11895 XID_Start and subsequent characters in XID_Continue,
11896 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011897 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011898 letters, digits, underscore). However, given the current
11899 definition of XID_Start and XID_Continue, it is sufficient
11900 to check just for these, except that _ must be allowed
11901 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011903 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011904 return 0;
11905
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011906 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011909 return 1;
11910}
11911
11912PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011914\n\
11915Return True if S is a valid identifier according\n\
11916to the language definition.");
11917
11918static PyObject*
11919unicode_isidentifier(PyObject *self)
11920{
11921 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11922}
11923
Georg Brandl559e5d72008-06-11 18:37:52 +000011924PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011926\n\
11927Return True if all characters in S are considered\n\
11928printable in repr() or S is empty, False otherwise.");
11929
11930static PyObject*
11931unicode_isprintable(PyObject *self)
11932{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 Py_ssize_t i, length;
11934 int kind;
11935 void *data;
11936
11937 if (PyUnicode_READY(self) == -1)
11938 return NULL;
11939 length = PyUnicode_GET_LENGTH(self);
11940 kind = PyUnicode_KIND(self);
11941 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011942
11943 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 if (length == 1)
11945 return PyBool_FromLong(
11946 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011947
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 for (i = 0; i < length; i++) {
11949 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011950 Py_RETURN_FALSE;
11951 }
11952 }
11953 Py_RETURN_TRUE;
11954}
11955
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011957 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958\n\
11959Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011960iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
11962static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011963unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011965 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966}
11967
Martin v. Löwis18e16552006-02-15 17:27:45 +000011968static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011969unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (PyUnicode_READY(self) == -1)
11972 return -1;
11973 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974}
11975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011976PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011979Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011980done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011983unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011985 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 Py_UCS4 fillchar = ' ';
11987
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011988 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 return NULL;
11990
Benjamin Petersonbac79492012-01-14 13:34:47 -050011991 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993
Victor Stinnerc4b49542011-12-11 22:44:26 +010011994 if (PyUnicode_GET_LENGTH(self) >= width)
11995 return unicode_result_unchanged(self);
11996
11997 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011998}
11999
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012000PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012001 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012003Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004
12005static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012006unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012008 if (PyUnicode_READY(self) == -1)
12009 return NULL;
12010 if (PyUnicode_IS_ASCII(self))
12011 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012012 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013}
12014
/* Selectors for the side(s) of the string a strip operation works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a selector: the format string without its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012024/* externally visible for str.strip(unicode) */
12025PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012026_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 void *data;
12029 int kind;
12030 Py_ssize_t i, j, len;
12031 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12034 return NULL;
12035
12036 kind = PyUnicode_KIND(self);
12037 data = PyUnicode_DATA(self);
12038 len = PyUnicode_GET_LENGTH(self);
12039 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12040 PyUnicode_DATA(sepobj),
12041 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000012042
Benjamin Peterson14339b62009-01-31 16:36:08 +000012043 i = 0;
12044 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 while (i < len &&
12046 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012047 i++;
12048 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012049 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012050
Benjamin Peterson14339b62009-01-31 16:36:08 +000012051 j = len;
12052 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 do {
12054 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 } while (j >= i &&
12056 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012057 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012058 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012059
Victor Stinner7931d9a2011-11-04 00:22:48 +010012060 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012061}
12062
12063PyObject*
12064PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12065{
12066 unsigned char *data;
12067 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012068 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069
Victor Stinnerde636f32011-10-01 03:55:54 +020012070 if (PyUnicode_READY(self) == -1)
12071 return NULL;
12072
Victor Stinner684d5fd2012-05-03 02:32:34 +020012073 length = PyUnicode_GET_LENGTH(self);
12074 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012075
Victor Stinner684d5fd2012-05-03 02:32:34 +020012076 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012077 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078
Victor Stinnerde636f32011-10-01 03:55:54 +020012079 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012080 PyErr_SetString(PyExc_IndexError, "string index out of range");
12081 return NULL;
12082 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020012083 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020012084 Py_INCREF(unicode_empty);
12085 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020012086 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012087
Victor Stinner684d5fd2012-05-03 02:32:34 +020012088 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012089 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012090 data = PyUnicode_1BYTE_DATA(self);
12091 return unicode_fromascii(data + start, length);
12092 }
12093 else {
12094 kind = PyUnicode_KIND(self);
12095 data = PyUnicode_1BYTE_DATA(self);
12096 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012097 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012098 length);
12099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101
12102static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012103do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 int kind;
12106 void *data;
12107 Py_ssize_t len, i, j;
12108
12109 if (PyUnicode_READY(self) == -1)
12110 return NULL;
12111
12112 kind = PyUnicode_KIND(self);
12113 data = PyUnicode_DATA(self);
12114 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012115
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 i = 0;
12117 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012119 i++;
12120 }
12121 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 j = len;
12124 if (striptype != LEFTSTRIP) {
12125 do {
12126 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012128 j++;
12129 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130
Victor Stinner7931d9a2011-11-04 00:22:48 +010012131 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132}
12133
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134
12135static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012136do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012137{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012138 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12141 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012142
Benjamin Peterson14339b62009-01-31 16:36:08 +000012143 if (sep != NULL && sep != Py_None) {
12144 if (PyUnicode_Check(sep))
12145 return _PyUnicode_XStrip(self, striptype, sep);
12146 else {
12147 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 "%s arg must be None or str",
12149 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012150 return NULL;
12151 }
12152 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012153
Benjamin Peterson14339b62009-01-31 16:36:08 +000012154 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155}
12156
12157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160\n\
12161Return a copy of the string S with leading and trailing\n\
12162whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012163If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164
12165static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012166unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012167{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012168 if (PyTuple_GET_SIZE(args) == 0)
12169 return do_strip(self, BOTHSTRIP); /* Common case */
12170 else
12171 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012172}
12173
12174
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012175PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012177\n\
12178Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012179If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012180
12181static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012182unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012183{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012184 if (PyTuple_GET_SIZE(args) == 0)
12185 return do_strip(self, LEFTSTRIP); /* Common case */
12186 else
12187 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012188}
12189
12190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012191PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012192 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012193\n\
12194Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012195If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012196
12197static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012198unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012199{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012200 if (PyTuple_GET_SIZE(args) == 0)
12201 return do_strip(self, RIGHTSTRIP); /* Common case */
12202 else
12203 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012204}
12205
12206
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012208unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012210 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012211 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
Georg Brandl222de0f2009-04-12 12:01:50 +000012213 if (len < 1) {
12214 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012215 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
Victor Stinnerc4b49542011-12-11 22:44:26 +010012218 /* no repeat, return original string */
12219 if (len == 1)
12220 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012221
Benjamin Petersonbac79492012-01-14 13:34:47 -050012222 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 return NULL;
12224
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012225 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012226 PyErr_SetString(PyExc_OverflowError,
12227 "repeated string is too long");
12228 return NULL;
12229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012231
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012232 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233 if (!u)
12234 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012235 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (PyUnicode_GET_LENGTH(str) == 1) {
12238 const int kind = PyUnicode_KIND(str);
12239 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012240 if (kind == PyUnicode_1BYTE_KIND) {
12241 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012242 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012243 }
12244 else if (kind == PyUnicode_2BYTE_KIND) {
12245 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012246 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012247 ucs2[n] = fill_char;
12248 } else {
12249 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12250 assert(kind == PyUnicode_4BYTE_KIND);
12251 for (n = 0; n < len; ++n)
12252 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 }
12255 else {
12256 /* number of characters copied this far */
12257 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012258 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 char *to = (char *) PyUnicode_DATA(u);
12260 Py_MEMCPY(to, PyUnicode_DATA(str),
12261 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012263 n = (done <= nchars-done) ? done : nchars-done;
12264 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012265 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012266 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 }
12268
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012269 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012270 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271}
12272
Alexander Belopolsky40018472011-02-26 01:02:56 +000012273PyObject *
12274PyUnicode_Replace(PyObject *obj,
12275 PyObject *subobj,
12276 PyObject *replobj,
12277 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278{
12279 PyObject *self;
12280 PyObject *str1;
12281 PyObject *str2;
12282 PyObject *result;
12283
12284 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012285 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012288 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 Py_DECREF(self);
12290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 }
12292 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012293 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012294 Py_DECREF(self);
12295 Py_DECREF(str1);
12296 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012298 if (PyUnicode_READY(self) == -1 ||
12299 PyUnicode_READY(str1) == -1 ||
12300 PyUnicode_READY(str2) == -1)
12301 result = NULL;
12302 else
12303 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 Py_DECREF(self);
12305 Py_DECREF(str1);
12306 Py_DECREF(str2);
12307 return result;
12308}
12309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012310PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012311 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312\n\
12313Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012314old replaced by new. If the optional argument count is\n\
12315given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316
12317static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 PyObject *str1;
12321 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012322 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323 PyObject *result;
12324
Martin v. Löwis18e16552006-02-15 17:27:45 +000012325 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012327 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012330 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 return NULL;
12332 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012333 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 Py_DECREF(str1);
12335 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012336 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012337 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12338 result = NULL;
12339 else
12340 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341
12342 Py_DECREF(str1);
12343 Py_DECREF(str2);
12344 return result;
12345}
12346
Alexander Belopolsky40018472011-02-26 01:02:56 +000012347static PyObject *
12348unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012349{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012350 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 Py_ssize_t isize;
12352 Py_ssize_t osize, squote, dquote, i, o;
12353 Py_UCS4 max, quote;
12354 int ikind, okind;
12355 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012358 return NULL;
12359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 isize = PyUnicode_GET_LENGTH(unicode);
12361 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012363 /* Compute length of output, quote characters, and
12364 maximum character */
12365 osize = 2; /* quotes */
12366 max = 127;
12367 squote = dquote = 0;
12368 ikind = PyUnicode_KIND(unicode);
12369 for (i = 0; i < isize; i++) {
12370 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12371 switch (ch) {
12372 case '\'': squote++; osize++; break;
12373 case '"': dquote++; osize++; break;
12374 case '\\': case '\t': case '\r': case '\n':
12375 osize += 2; break;
12376 default:
12377 /* Fast-path ASCII */
12378 if (ch < ' ' || ch == 0x7f)
12379 osize += 4; /* \xHH */
12380 else if (ch < 0x7f)
12381 osize++;
12382 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12383 osize++;
12384 max = ch > max ? ch : max;
12385 }
12386 else if (ch < 0x100)
12387 osize += 4; /* \xHH */
12388 else if (ch < 0x10000)
12389 osize += 6; /* \uHHHH */
12390 else
12391 osize += 10; /* \uHHHHHHHH */
12392 }
12393 }
12394
12395 quote = '\'';
12396 if (squote) {
12397 if (dquote)
12398 /* Both squote and dquote present. Use squote,
12399 and escape them */
12400 osize += squote;
12401 else
12402 quote = '"';
12403 }
12404
12405 repr = PyUnicode_New(osize, max);
12406 if (repr == NULL)
12407 return NULL;
12408 okind = PyUnicode_KIND(repr);
12409 odata = PyUnicode_DATA(repr);
12410
12411 PyUnicode_WRITE(okind, odata, 0, quote);
12412 PyUnicode_WRITE(okind, odata, osize-1, quote);
12413
12414 for (i = 0, o = 1; i < isize; i++) {
12415 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012416
12417 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418 if ((ch == quote) || (ch == '\\')) {
12419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012421 continue;
12422 }
12423
Benjamin Peterson29060642009-01-31 22:14:21 +000012424 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012425 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012426 PyUnicode_WRITE(okind, odata, o++, '\\');
12427 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012428 }
12429 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 PyUnicode_WRITE(okind, odata, o++, '\\');
12431 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012432 }
12433 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012434 PyUnicode_WRITE(okind, odata, o++, '\\');
12435 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012436 }
12437
12438 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012439 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440 PyUnicode_WRITE(okind, odata, o++, '\\');
12441 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12443 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012444 }
12445
Georg Brandl559e5d72008-06-11 18:37:52 +000012446 /* Copy ASCII characters as-is */
12447 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012449 }
12450
Benjamin Peterson29060642009-01-31 22:14:21 +000012451 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012452 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012453 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012454 (categories Z* and C* except ASCII space)
12455 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012457 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012458 if (ch <= 0xff) {
12459 PyUnicode_WRITE(okind, odata, o++, '\\');
12460 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012463 }
12464 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 else if (ch >= 0x10000) {
12466 PyUnicode_WRITE(okind, odata, o++, '\\');
12467 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12473 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12474 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12475 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012476 }
12477 /* Map 16-bit characters to '\uxxxx' */
12478 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 PyUnicode_WRITE(okind, odata, o++, '\\');
12480 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012481 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12482 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12483 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12484 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012485 }
12486 }
12487 /* Copy characters as-is */
12488 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012489 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012490 }
12491 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012492 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012494 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012495 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496}
12497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012498PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012499 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500\n\
12501Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012502such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503arguments start and end are interpreted as in slice notation.\n\
12504\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012505Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506
12507static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012509{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012510 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012511 Py_ssize_t start;
12512 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012513 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514
Jesus Ceaac451502011-04-20 17:09:23 +020012515 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12516 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519 if (PyUnicode_READY(self) == -1)
12520 return NULL;
12521 if (PyUnicode_READY(substring) == -1)
12522 return NULL;
12523
Victor Stinner7931d9a2011-11-04 00:22:48 +010012524 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
12526 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 if (result == -2)
12529 return NULL;
12530
Christian Heimes217cfd12007-12-02 14:31:20 +000012531 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532}
12533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012534PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012535 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
12539static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012542 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012543 Py_ssize_t start;
12544 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012545 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546
Jesus Ceaac451502011-04-20 17:09:23 +020012547 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12548 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012549 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 if (PyUnicode_READY(self) == -1)
12552 return NULL;
12553 if (PyUnicode_READY(substring) == -1)
12554 return NULL;
12555
Victor Stinner7931d9a2011-11-04 00:22:48 +010012556 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
12558 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 if (result == -2)
12561 return NULL;
12562
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 if (result < 0) {
12564 PyErr_SetString(PyExc_ValueError, "substring not found");
12565 return NULL;
12566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567
Christian Heimes217cfd12007-12-02 14:31:20 +000012568 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569}
12570
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012571PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012574Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012575done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576
12577static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012578unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012580 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 Py_UCS4 fillchar = ' ';
12582
Victor Stinnere9a29352011-10-01 02:14:59 +020012583 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012585
Benjamin Petersonbac79492012-01-14 13:34:47 -050012586 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return NULL;
12588
Victor Stinnerc4b49542011-12-11 22:44:26 +010012589 if (PyUnicode_GET_LENGTH(self) >= width)
12590 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591
Victor Stinnerc4b49542011-12-11 22:44:26 +010012592 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593}
12594
Alexander Belopolsky40018472011-02-26 01:02:56 +000012595PyObject *
12596PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597{
12598 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012599
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600 s = PyUnicode_FromObject(s);
12601 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012602 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 if (sep != NULL) {
12604 sep = PyUnicode_FromObject(sep);
12605 if (sep == NULL) {
12606 Py_DECREF(s);
12607 return NULL;
12608 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609 }
12610
Victor Stinner9310abb2011-10-05 00:59:23 +020012611 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
12613 Py_DECREF(s);
12614 Py_XDECREF(sep);
12615 return result;
12616}
12617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012618PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012619 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620\n\
12621Return a list of the words in S, using sep as the\n\
12622delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012623splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012624whitespace string is a separator and empty strings are\n\
12625removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626
12627static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012628unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012630 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012632 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012634 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12635 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 return NULL;
12637
12638 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012641 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012642 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012643 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644}
12645
Thomas Wouters477c8d52006-05-27 19:21:47 +000012646PyObject *
12647PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12648{
12649 PyObject* str_obj;
12650 PyObject* sep_obj;
12651 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 int kind1, kind2, kind;
12653 void *buf1 = NULL, *buf2 = NULL;
12654 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012655
12656 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012657 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012659 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012660 if (!sep_obj) {
12661 Py_DECREF(str_obj);
12662 return NULL;
12663 }
12664 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12665 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012666 Py_DECREF(str_obj);
12667 return NULL;
12668 }
12669
Victor Stinner14f8f022011-10-05 20:58:25 +020012670 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012672 kind = Py_MAX(kind1, kind2);
12673 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012675 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 if (!buf1)
12677 goto onError;
12678 buf2 = PyUnicode_DATA(sep_obj);
12679 if (kind2 != kind)
12680 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12681 if (!buf2)
12682 goto onError;
12683 len1 = PyUnicode_GET_LENGTH(str_obj);
12684 len2 = PyUnicode_GET_LENGTH(sep_obj);
12685
Benjamin Petersonead6b532011-12-20 17:23:42 -060012686 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012688 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12689 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12690 else
12691 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692 break;
12693 case PyUnicode_2BYTE_KIND:
12694 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12695 break;
12696 case PyUnicode_4BYTE_KIND:
12697 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12698 break;
12699 default:
12700 assert(0);
12701 out = 0;
12702 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012703
12704 Py_DECREF(sep_obj);
12705 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 if (kind1 != kind)
12707 PyMem_Free(buf1);
12708 if (kind2 != kind)
12709 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012710
12711 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 onError:
12713 Py_DECREF(sep_obj);
12714 Py_DECREF(str_obj);
12715 if (kind1 != kind && buf1)
12716 PyMem_Free(buf1);
12717 if (kind2 != kind && buf2)
12718 PyMem_Free(buf2);
12719 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012720}
12721
12722
12723PyObject *
12724PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12725{
12726 PyObject* str_obj;
12727 PyObject* sep_obj;
12728 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 int kind1, kind2, kind;
12730 void *buf1 = NULL, *buf2 = NULL;
12731 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012732
12733 str_obj = PyUnicode_FromObject(str_in);
12734 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012736 sep_obj = PyUnicode_FromObject(sep_in);
12737 if (!sep_obj) {
12738 Py_DECREF(str_obj);
12739 return NULL;
12740 }
12741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 kind1 = PyUnicode_KIND(str_in);
12743 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012744 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 buf1 = PyUnicode_DATA(str_in);
12746 if (kind1 != kind)
12747 buf1 = _PyUnicode_AsKind(str_in, kind);
12748 if (!buf1)
12749 goto onError;
12750 buf2 = PyUnicode_DATA(sep_obj);
12751 if (kind2 != kind)
12752 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12753 if (!buf2)
12754 goto onError;
12755 len1 = PyUnicode_GET_LENGTH(str_obj);
12756 len2 = PyUnicode_GET_LENGTH(sep_obj);
12757
Benjamin Petersonead6b532011-12-20 17:23:42 -060012758 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012760 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12761 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12762 else
12763 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 break;
12765 case PyUnicode_2BYTE_KIND:
12766 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12767 break;
12768 case PyUnicode_4BYTE_KIND:
12769 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12770 break;
12771 default:
12772 assert(0);
12773 out = 0;
12774 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775
12776 Py_DECREF(sep_obj);
12777 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 if (kind1 != kind)
12779 PyMem_Free(buf1);
12780 if (kind2 != kind)
12781 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782
12783 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012784 onError:
12785 Py_DECREF(sep_obj);
12786 Py_DECREF(str_obj);
12787 if (kind1 != kind && buf1)
12788 PyMem_Free(buf1);
12789 if (kind2 != kind && buf2)
12790 PyMem_Free(buf2);
12791 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792}
12793
12794PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012797Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012799found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800
12801static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012802unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803{
Victor Stinner9310abb2011-10-05 00:59:23 +020012804 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805}
12806
12807PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012808 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012810Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012812separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
12814static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012815unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012816{
Victor Stinner9310abb2011-10-05 00:59:23 +020012817 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818}
12819
Alexander Belopolsky40018472011-02-26 01:02:56 +000012820PyObject *
12821PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012822{
12823 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012825 s = PyUnicode_FromObject(s);
12826 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012827 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012828 if (sep != NULL) {
12829 sep = PyUnicode_FromObject(sep);
12830 if (sep == NULL) {
12831 Py_DECREF(s);
12832 return NULL;
12833 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012834 }
12835
Victor Stinner9310abb2011-10-05 00:59:23 +020012836 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012837
12838 Py_DECREF(s);
12839 Py_XDECREF(sep);
12840 return result;
12841}
12842
12843PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012844 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012845\n\
12846Return a list of the words in S, using sep as the\n\
12847delimiter string, starting at the end of the string and\n\
12848working to the front. If maxsplit is given, at most maxsplit\n\
12849splits are done. If sep is not specified, any whitespace string\n\
12850is a separator.");
12851
12852static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012853unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012854{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012855 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012856 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012857 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012858
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012859 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12860 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012861 return NULL;
12862
12863 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012865 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012866 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012867 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012868 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012869}
12870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012871PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873\n\
12874Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012875Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012876is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877
12878static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012879unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012881 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012882 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012884 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12885 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886 return NULL;
12887
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012888 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889}
12890
12891static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012892PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012894 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895}
12896
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012897PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012898 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899\n\
12900Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012901and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902
12903static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012904unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012906 if (PyUnicode_READY(self) == -1)
12907 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012908 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909}
12910
Georg Brandlceee0772007-11-27 23:48:05 +000012911PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012912 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012913\n\
12914Return a translation table usable for str.translate().\n\
12915If there is only one argument, it must be a dictionary mapping Unicode\n\
12916ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012917Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012918If there are two arguments, they must be strings of equal length, and\n\
12919in the resulting dictionary, each character in x will be mapped to the\n\
12920character at the same position in y. If there is a third argument, it\n\
12921must be a string, whose characters will be mapped to None in the result.");
12922
12923static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012924unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012925{
12926 PyObject *x, *y = NULL, *z = NULL;
12927 PyObject *new = NULL, *key, *value;
12928 Py_ssize_t i = 0;
12929 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012930
Georg Brandlceee0772007-11-27 23:48:05 +000012931 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12932 return NULL;
12933 new = PyDict_New();
12934 if (!new)
12935 return NULL;
12936 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 int x_kind, y_kind, z_kind;
12938 void *x_data, *y_data, *z_data;
12939
Georg Brandlceee0772007-11-27 23:48:05 +000012940 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012941 if (!PyUnicode_Check(x)) {
12942 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12943 "be a string if there is a second argument");
12944 goto err;
12945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012947 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12948 "arguments must have equal length");
12949 goto err;
12950 }
12951 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 x_kind = PyUnicode_KIND(x);
12953 y_kind = PyUnicode_KIND(y);
12954 x_data = PyUnicode_DATA(x);
12955 y_data = PyUnicode_DATA(y);
12956 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12957 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012958 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012959 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012960 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012961 if (!value) {
12962 Py_DECREF(key);
12963 goto err;
12964 }
Georg Brandlceee0772007-11-27 23:48:05 +000012965 res = PyDict_SetItem(new, key, value);
12966 Py_DECREF(key);
12967 Py_DECREF(value);
12968 if (res < 0)
12969 goto err;
12970 }
12971 /* create entries for deleting chars in z */
12972 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 z_kind = PyUnicode_KIND(z);
12974 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012975 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012977 if (!key)
12978 goto err;
12979 res = PyDict_SetItem(new, key, Py_None);
12980 Py_DECREF(key);
12981 if (res < 0)
12982 goto err;
12983 }
12984 }
12985 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 int kind;
12987 void *data;
12988
Georg Brandlceee0772007-11-27 23:48:05 +000012989 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012990 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012991 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12992 "to maketrans it must be a dict");
12993 goto err;
12994 }
12995 /* copy entries into the new dict, converting string keys to int keys */
12996 while (PyDict_Next(x, &i, &key, &value)) {
12997 if (PyUnicode_Check(key)) {
12998 /* convert string keys to integer keys */
12999 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013000 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013001 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13002 "table must be of length 1");
13003 goto err;
13004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 kind = PyUnicode_KIND(key);
13006 data = PyUnicode_DATA(key);
13007 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013008 if (!newkey)
13009 goto err;
13010 res = PyDict_SetItem(new, newkey, value);
13011 Py_DECREF(newkey);
13012 if (res < 0)
13013 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013014 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013015 /* just keep integer keys */
13016 if (PyDict_SetItem(new, key, value) < 0)
13017 goto err;
13018 } else {
13019 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13020 "be strings or integers");
13021 goto err;
13022 }
13023 }
13024 }
13025 return new;
13026 err:
13027 Py_DECREF(new);
13028 return NULL;
13029}
13030
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013031PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013032 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033\n\
13034Return a copy of the string S, where all characters have been mapped\n\
13035through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013036Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013037Unmapped characters are left untouched. Characters mapped to None\n\
13038are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039
13040static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044}
13045
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013046PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013047 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013049Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050
13051static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013052unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013054 if (PyUnicode_READY(self) == -1)
13055 return NULL;
13056 if (PyUnicode_IS_ASCII(self))
13057 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013058 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013059}
13060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013061PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013062 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013064Pad a numeric string S with zeros on the left, to fill a field\n\
13065of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066
13067static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013068unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013070 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013071 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013072 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 int kind;
13074 void *data;
13075 Py_UCS4 chr;
13076
Martin v. Löwis18e16552006-02-15 17:27:45 +000013077 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078 return NULL;
13079
Benjamin Petersonbac79492012-01-14 13:34:47 -050013080 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082
Victor Stinnerc4b49542011-12-11 22:44:26 +010013083 if (PyUnicode_GET_LENGTH(self) >= width)
13084 return unicode_result_unchanged(self);
13085
13086 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087
13088 u = pad(self, fill, 0, '0');
13089
Walter Dörwald068325e2002-04-15 13:36:47 +000013090 if (u == NULL)
13091 return NULL;
13092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013093 kind = PyUnicode_KIND(u);
13094 data = PyUnicode_DATA(u);
13095 chr = PyUnicode_READ(kind, data, fill);
13096
13097 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 PyUnicode_WRITE(kind, data, 0, chr);
13100 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 }
13102
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013103 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013104 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13114
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013115PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013118Return True if S starts with the specified prefix, False otherwise.\n\
13119With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013120With optional end, stop comparing S at that position.\n\
13121prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122
13123static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013124unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013127 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013128 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013129 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013130 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013131 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
Jesus Ceaac451502011-04-20 17:09:23 +020013133 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013134 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013135 if (PyTuple_Check(subobj)) {
13136 Py_ssize_t i;
13137 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013138 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013139 if (substring == NULL)
13140 return NULL;
13141 result = tailmatch(self, substring, start, end, -1);
13142 Py_DECREF(substring);
13143 if (result) {
13144 Py_RETURN_TRUE;
13145 }
13146 }
13147 /* nothing matched */
13148 Py_RETURN_FALSE;
13149 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013150 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013151 if (substring == NULL) {
13152 if (PyErr_ExceptionMatches(PyExc_TypeError))
13153 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13154 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013156 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013157 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013158 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013159 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160}
13161
13162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013163PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013166Return True if S ends with the specified suffix, False otherwise.\n\
13167With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013168With optional end, stop comparing S at that position.\n\
13169suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170
13171static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013172unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013175 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013176 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013177 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013178 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013179 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013180
Jesus Ceaac451502011-04-20 17:09:23 +020013181 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013183 if (PyTuple_Check(subobj)) {
13184 Py_ssize_t i;
13185 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013186 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013188 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 result = tailmatch(self, substring, start, end, +1);
13191 Py_DECREF(substring);
13192 if (result) {
13193 Py_RETURN_TRUE;
13194 }
13195 }
13196 Py_RETURN_FALSE;
13197 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013198 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013199 if (substring == NULL) {
13200 if (PyErr_ExceptionMatches(PyExc_TypeError))
13201 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13202 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013203 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013204 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013205 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013207 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208}
13209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013211
13212PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013213 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013214\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013215Return a formatted version of S, using substitutions from args and kwargs.\n\
13216The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013217
Eric Smith27bbca62010-11-04 17:06:58 +000013218PyDoc_STRVAR(format_map__doc__,
13219 "S.format_map(mapping) -> str\n\
13220\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013221Return a formatted version of S, using substitutions from mapping.\n\
13222The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013223
Eric Smith4a7d76d2008-05-30 18:10:19 +000013224static PyObject *
13225unicode__format__(PyObject* self, PyObject* args)
13226{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013227 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013228
13229 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13230 return NULL;
13231
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013232 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013233 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013234 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013235}
13236
Eric Smith8c663262007-08-25 02:26:07 +000013237PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013239\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013240Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013241
13242static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013243unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 Py_ssize_t size;
13246
13247 /* If it's a compact object, account for base structure +
13248 character data. */
13249 if (PyUnicode_IS_COMPACT_ASCII(v))
13250 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13251 else if (PyUnicode_IS_COMPACT(v))
13252 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013253 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013254 else {
13255 /* If it is a two-block object, account for base object, and
13256 for character block if present. */
13257 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013258 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013260 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 }
13262 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013263 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013264 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013266 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013267 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013268
13269 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013270}
13271
13272PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013274
13275static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013276unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013277{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013278 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 if (!copy)
13280 return NULL;
13281 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013282}
13283
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013285 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013286 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013287 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13288 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013289 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13290 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013291 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013292 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13293 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13294 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13295 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13296 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013297 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013298 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13299 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13300 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013301 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013302 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13303 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13304 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013305 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013306 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013307 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013308 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013309 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13310 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13311 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13312 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13313 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13314 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13315 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13316 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13317 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13318 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13319 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13320 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13321 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13322 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013323 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013324 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013325 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013326 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013327 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013328 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013329 {"maketrans", (PyCFunction) unicode_maketrans,
13330 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013331 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013332#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013333 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013334 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335#endif
13336
Benjamin Peterson14339b62009-01-31 16:36:08 +000013337 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338 {NULL, NULL}
13339};
13340
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013341static PyObject *
13342unicode_mod(PyObject *v, PyObject *w)
13343{
Brian Curtindfc80e32011-08-10 20:28:54 -050013344 if (!PyUnicode_Check(v))
13345 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013347}
13348
13349static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013350 0, /*nb_add*/
13351 0, /*nb_subtract*/
13352 0, /*nb_multiply*/
13353 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013354};
13355
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013357 (lenfunc) unicode_length, /* sq_length */
13358 PyUnicode_Concat, /* sq_concat */
13359 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13360 (ssizeargfunc) unicode_getitem, /* sq_item */
13361 0, /* sq_slice */
13362 0, /* sq_ass_item */
13363 0, /* sq_ass_slice */
13364 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365};
13366
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013367static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013368unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013370 if (PyUnicode_READY(self) == -1)
13371 return NULL;
13372
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013373 if (PyIndex_Check(item)) {
13374 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013375 if (i == -1 && PyErr_Occurred())
13376 return NULL;
13377 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013379 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013380 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013381 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013382 PyObject *result;
13383 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013384 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013385 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013389 return NULL;
13390 }
13391
13392 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013393 Py_INCREF(unicode_empty);
13394 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013395 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013396 slicelength == PyUnicode_GET_LENGTH(self)) {
13397 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013398 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013399 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013400 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013401 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013402 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013403 src_kind = PyUnicode_KIND(self);
13404 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013405 if (!PyUnicode_IS_ASCII(self)) {
13406 kind_limit = kind_maxchar_limit(src_kind);
13407 max_char = 0;
13408 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13409 ch = PyUnicode_READ(src_kind, src_data, cur);
13410 if (ch > max_char) {
13411 max_char = ch;
13412 if (max_char >= kind_limit)
13413 break;
13414 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013415 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013416 }
Victor Stinner55c99112011-10-13 01:17:06 +020013417 else
13418 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013419 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013420 if (result == NULL)
13421 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013422 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013423 dest_data = PyUnicode_DATA(result);
13424
13425 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013426 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13427 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013428 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013429 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013430 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013431 } else {
13432 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13433 return NULL;
13434 }
13435}
13436
13437static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 (lenfunc)unicode_length, /* mp_length */
13439 (binaryfunc)unicode_subscript, /* mp_subscript */
13440 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013441};
13442
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444/* Helpers for PyUnicode_Format() */
13445
13446static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013447getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013449 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 (*p_argidx)++;
13452 if (arglen < 0)
13453 return args;
13454 else
13455 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456 }
13457 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013459 return NULL;
13460}
13461
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013462/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013464static PyObject *
13465formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013467 char *p;
13468 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013469 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013470
Guido van Rossumd57fd912000-03-10 22:53:23 +000013471 x = PyFloat_AsDouble(v);
13472 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013473 return NULL;
13474
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013476 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013477
Eric Smith0923d1d2009-04-16 20:16:10 +000013478 p = PyOS_double_to_string(x, type, prec,
13479 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013480 if (p == NULL)
13481 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013482 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013483 PyMem_Free(p);
13484 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485}
13486
Victor Stinnerd0880d52012-04-27 23:40:13 +020013487/* formatlong() emulates the format codes d, u, o, x and X, and
13488 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13489 * Python's regular ints.
13490 * Return value: a new PyUnicodeObject*, or NULL if error.
13491 * The output string is of the form
13492 * "-"? ("0x" | "0X")? digit+
13493 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13494 * set in flags. The case of hex digits will be correct,
13495 * There will be at least prec digits, zero-filled on the left if
13496 * necessary to get that many.
13497 * val object to be converted
13498 * flags bitmask of format flags; only F_ALT is looked at
13499 * prec minimum number of digits; 0-fill on left if needed
13500 * type a character in [duoxX]; u acts the same as d
13501 *
13502 * CAUTION: o, x and X conversions on regular ints can never
13503 * produce a '-' sign, but can for Python's unbounded ints.
13504 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013505static PyObject*
13506formatlong(PyObject *val, int flags, int prec, int type)
13507{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013508 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013509 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013510 Py_ssize_t i;
13511 int sign; /* 1 if '-', else 0 */
13512 int len; /* number of characters */
13513 Py_ssize_t llen;
13514 int numdigits; /* len == numnondigits + numdigits */
13515 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013516
Victor Stinnerd0880d52012-04-27 23:40:13 +020013517 /* Avoid exceeding SSIZE_T_MAX */
13518 if (prec > INT_MAX-3) {
13519 PyErr_SetString(PyExc_OverflowError,
13520 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013521 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013522 }
13523
13524 assert(PyLong_Check(val));
13525
13526 switch (type) {
13527 case 'd':
13528 case 'u':
13529 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013530 if (PyBool_Check(val))
13531 result = PyNumber_ToBase(val, 10);
13532 else
13533 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013534 break;
13535 case 'o':
13536 numnondigits = 2;
13537 result = PyNumber_ToBase(val, 8);
13538 break;
13539 case 'x':
13540 case 'X':
13541 numnondigits = 2;
13542 result = PyNumber_ToBase(val, 16);
13543 break;
13544 default:
13545 assert(!"'type' not in [duoxX]");
13546 }
13547 if (!result)
13548 return NULL;
13549
13550 assert(unicode_modifiable(result));
13551 assert(PyUnicode_IS_READY(result));
13552 assert(PyUnicode_IS_ASCII(result));
13553
13554 /* To modify the string in-place, there can only be one reference. */
13555 if (Py_REFCNT(result) != 1) {
13556 PyErr_BadInternalCall();
13557 return NULL;
13558 }
13559 buf = PyUnicode_DATA(result);
13560 llen = PyUnicode_GET_LENGTH(result);
13561 if (llen > INT_MAX) {
13562 PyErr_SetString(PyExc_ValueError,
13563 "string too large in _PyBytes_FormatLong");
13564 return NULL;
13565 }
13566 len = (int)llen;
13567 sign = buf[0] == '-';
13568 numnondigits += sign;
13569 numdigits = len - numnondigits;
13570 assert(numdigits > 0);
13571
13572 /* Get rid of base marker unless F_ALT */
13573 if (((flags & F_ALT) == 0 &&
13574 (type == 'o' || type == 'x' || type == 'X'))) {
13575 assert(buf[sign] == '0');
13576 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13577 buf[sign+1] == 'o');
13578 numnondigits -= 2;
13579 buf += 2;
13580 len -= 2;
13581 if (sign)
13582 buf[0] = '-';
13583 assert(len == numnondigits + numdigits);
13584 assert(numdigits > 0);
13585 }
13586
13587 /* Fill with leading zeroes to meet minimum width. */
13588 if (prec > numdigits) {
13589 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13590 numnondigits + prec);
13591 char *b1;
13592 if (!r1) {
13593 Py_DECREF(result);
13594 return NULL;
13595 }
13596 b1 = PyBytes_AS_STRING(r1);
13597 for (i = 0; i < numnondigits; ++i)
13598 *b1++ = *buf++;
13599 for (i = 0; i < prec - numdigits; i++)
13600 *b1++ = '0';
13601 for (i = 0; i < numdigits; i++)
13602 *b1++ = *buf++;
13603 *b1 = '\0';
13604 Py_DECREF(result);
13605 result = r1;
13606 buf = PyBytes_AS_STRING(result);
13607 len = numnondigits + prec;
13608 }
13609
13610 /* Fix up case for hex conversions. */
13611 if (type == 'X') {
13612 /* Need to convert all lower case letters to upper case.
13613 and need to convert 0x to 0X (and -0x to -0X). */
13614 for (i = 0; i < len; i++)
13615 if (buf[i] >= 'a' && buf[i] <= 'x')
13616 buf[i] -= 'a'-'A';
13617 }
13618 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13619 PyObject *unicode;
13620 unicode = unicode_fromascii((unsigned char *)buf, len);
13621 Py_DECREF(result);
13622 result = unicode;
13623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013624 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013625}
13626
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013627static Py_UCS4
13628formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013629{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013630 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013631 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013632 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013633 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 goto onError;
13636 }
13637 else {
13638 /* Integer input truncated to a character */
13639 long x;
13640 x = PyLong_AsLong(v);
13641 if (x == -1 && PyErr_Occurred())
13642 goto onError;
13643
Victor Stinner8faf8212011-12-08 22:14:11 +010013644 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 PyErr_SetString(PyExc_OverflowError,
13646 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013648 }
13649
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013650 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013651 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013652
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013654 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013655 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013656 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013657}
13658
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013659struct unicode_writer_t {
13660 PyObject *buffer;
13661 void *data;
13662 enum PyUnicode_Kind kind;
13663 Py_UCS4 maxchar;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013664 Py_ssize_t pos;
13665};
13666
13667Py_LOCAL_INLINE(void)
13668unicode_writer_update(struct unicode_writer_t *writer)
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013669{
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013670 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13671 writer->data = PyUnicode_DATA(writer->buffer);
13672 writer->kind = PyUnicode_KIND(writer->buffer);
13673}
13674
13675Py_LOCAL_INLINE(int)
13676unicode_writer_init(struct unicode_writer_t *writer,
13677 Py_ssize_t length, Py_UCS4 maxchar)
13678{
13679 writer->pos = 0;
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013680 writer->buffer = PyUnicode_New(length, maxchar);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013681 if (writer->buffer == NULL)
13682 return -1;
13683 unicode_writer_update(writer);
13684 return 0;
13685}
13686
13687Py_LOCAL_INLINE(int)
13688unicode_writer_prepare(struct unicode_writer_t *writer,
13689 Py_ssize_t length, Py_UCS4 maxchar)
13690{
13691 Py_ssize_t newlen;
Victor Stinner79891572012-05-03 13:43:07 +020013692 PyObject *newbuffer;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013693
13694 if (length > PY_SSIZE_T_MAX - writer->pos) {
13695 PyErr_NoMemory();
13696 return -1;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013697 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013698 newlen = writer->pos + length;
13699
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013700 if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013701 /* overallocate 25% to limit the number of resize */
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013702 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
13703 newlen += newlen / 4;
Victor Stinner79891572012-05-03 13:43:07 +020013704
13705 if (maxchar > writer->maxchar) {
13706 /* resize + widen */
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013707 newbuffer = PyUnicode_New(newlen, maxchar);
Victor Stinner79891572012-05-03 13:43:07 +020013708 if (newbuffer == NULL)
13709 return -1;
13710 PyUnicode_CopyCharacters(newbuffer, 0,
13711 writer->buffer, 0, writer->pos);
13712 Py_DECREF(writer->buffer);
13713 }
13714 else {
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013715 newbuffer = resize_compact(writer->buffer, newlen);
Victor Stinner79891572012-05-03 13:43:07 +020013716 if (newbuffer == NULL)
13717 return -1;
13718 }
13719 writer->buffer = newbuffer;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013720 unicode_writer_update(writer);
13721 }
Victor Stinner79891572012-05-03 13:43:07 +020013722 else if (maxchar > writer->maxchar) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013723 if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
13724 return -1;
13725 unicode_writer_update(writer);
13726 }
13727 return 0;
13728}
13729
13730Py_LOCAL_INLINE(int)
13731unicode_writer_write_str(
13732 struct unicode_writer_t *writer,
13733 PyObject *str, Py_ssize_t start, Py_ssize_t length)
13734{
13735 Py_UCS4 maxchar;
13736 maxchar = _PyUnicode_FindMaxChar(str, start, start + length);
13737 if (unicode_writer_prepare(writer, length, maxchar) == -1)
13738 return -1;
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013739 assert((writer->pos + length) <= PyUnicode_GET_LENGTH(writer->buffer));
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013740 copy_characters(writer->buffer, writer->pos,
13741 str, start, length);
13742 writer->pos += length;
13743 return 0;
13744}
13745
13746Py_LOCAL_INLINE(int)
13747unicode_writer_write_char(
13748 struct unicode_writer_t *writer,
13749 Py_UCS4 ch)
13750{
13751 if (unicode_writer_prepare(writer, 1, ch) == -1)
13752 return -1;
Victor Stinnerbf4e2662012-05-03 19:27:14 +020013753 assert((writer->pos + 1) <= PyUnicode_GET_LENGTH(writer->buffer));
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013754 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13755 writer->pos += 1;
13756 return 0;
13757}
13758
13759Py_LOCAL_INLINE(void)
13760unicode_writer_dealloc(struct unicode_writer_t *writer)
13761{
13762 Py_CLEAR(writer->buffer);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013763}
13764
Alexander Belopolsky40018472011-02-26 01:02:56 +000013765PyObject *
13766PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013767{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013768 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013769 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013770 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013771 PyObject *temp = NULL;
13772 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013773 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013774 void *fmt;
13775 enum PyUnicode_Kind kind, fmtkind;
13776 struct unicode_writer_t writer;
Tim Petersced69f82003-09-16 20:30:58 +000013777
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013779 PyErr_BadInternalCall();
13780 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013781 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013782 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013783 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013784 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013785 if (PyUnicode_READY(uformat) == -1)
13786 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013788 fmt = PyUnicode_DATA(uformat);
13789 fmtkind = PyUnicode_KIND(uformat);
13790 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13791 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013793 if (unicode_writer_init(&writer, fmtcnt + 100, 127) < 0)
13794 goto onError;
13795
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013797 arglen = PyTuple_Size(args);
13798 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013799 }
13800 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013801 arglen = -1;
13802 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013804 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013805 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013806 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807
13808 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013809 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013810 Py_ssize_t nonfmtpos;
13811 nonfmtpos = fmtpos++;
13812 while (fmtcnt >= 0 &&
13813 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13814 fmtpos++;
13815 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013816 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013817 if (fmtcnt < 0)
13818 fmtpos--;
13819 if (unicode_writer_write_str(&writer, uformat, nonfmtpos, fmtpos - nonfmtpos) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013820 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013821 }
13822 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 /* Got a format specifier */
13824 int flags = 0;
13825 Py_ssize_t width = -1;
13826 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013827 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013828 Py_UCS4 fill;
13829 int sign;
13830 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 int isnumok;
13832 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013833 void *pbuf = NULL;
13834 Py_ssize_t pindex, len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013836 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013837 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13838 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013839 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013840 Py_ssize_t keylen;
13841 PyObject *key;
13842 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013843
Benjamin Peterson29060642009-01-31 22:14:21 +000013844 if (dict == NULL) {
13845 PyErr_SetString(PyExc_TypeError,
13846 "format requires a mapping");
13847 goto onError;
13848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013849 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013851 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013852 /* Skip over balanced parentheses */
13853 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013854 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13855 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013856 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013857 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013858 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013862 if (fmtcnt < 0 || pcount > 0) {
13863 PyErr_SetString(PyExc_ValueError,
13864 "incomplete format key");
13865 goto onError;
13866 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013867 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013868 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 if (key == NULL)
13870 goto onError;
13871 if (args_owned) {
13872 Py_DECREF(args);
13873 args_owned = 0;
13874 }
13875 args = PyObject_GetItem(dict, key);
13876 Py_DECREF(key);
13877 if (args == NULL) {
13878 goto onError;
13879 }
13880 args_owned = 1;
13881 arglen = -1;
13882 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013883 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013885 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13886 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013887 case '-': flags |= F_LJUST; continue;
13888 case '+': flags |= F_SIGN; continue;
13889 case ' ': flags |= F_BLANK; continue;
13890 case '#': flags |= F_ALT; continue;
13891 case '0': flags |= F_ZERO; continue;
13892 }
13893 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013894 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 if (c == '*') {
13896 v = getnextarg(args, arglen, &argidx);
13897 if (v == NULL)
13898 goto onError;
13899 if (!PyLong_Check(v)) {
13900 PyErr_SetString(PyExc_TypeError,
13901 "* wants int");
13902 goto onError;
13903 }
13904 width = PyLong_AsLong(v);
13905 if (width == -1 && PyErr_Occurred())
13906 goto onError;
13907 if (width < 0) {
13908 flags |= F_LJUST;
13909 width = -width;
13910 }
13911 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013912 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 }
13914 else if (c >= '0' && c <= '9') {
13915 width = c - '0';
13916 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013917 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013918 if (c < '0' || c > '9')
13919 break;
13920 if ((width*10) / 10 != width) {
13921 PyErr_SetString(PyExc_ValueError,
13922 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013923 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 }
13925 width = width*10 + (c - '0');
13926 }
13927 }
13928 if (c == '.') {
13929 prec = 0;
13930 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013931 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 if (c == '*') {
13933 v = getnextarg(args, arglen, &argidx);
13934 if (v == NULL)
13935 goto onError;
13936 if (!PyLong_Check(v)) {
13937 PyErr_SetString(PyExc_TypeError,
13938 "* wants int");
13939 goto onError;
13940 }
13941 prec = PyLong_AsLong(v);
13942 if (prec == -1 && PyErr_Occurred())
13943 goto onError;
13944 if (prec < 0)
13945 prec = 0;
13946 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013947 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 }
13949 else if (c >= '0' && c <= '9') {
13950 prec = c - '0';
13951 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013952 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013953 if (c < '0' || c > '9')
13954 break;
13955 if ((prec*10) / 10 != prec) {
13956 PyErr_SetString(PyExc_ValueError,
13957 "prec too big");
13958 goto onError;
13959 }
13960 prec = prec*10 + (c - '0');
13961 }
13962 }
13963 } /* prec */
13964 if (fmtcnt >= 0) {
13965 if (c == 'h' || c == 'l' || c == 'L') {
13966 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013967 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013968 }
13969 }
13970 if (fmtcnt < 0) {
13971 PyErr_SetString(PyExc_ValueError,
13972 "incomplete format");
13973 goto onError;
13974 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013975
13976 if (c == '%') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013977 if (unicode_writer_write_char(&writer, '%') < 0)
13978 goto onError;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013979 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013980 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013981
13982
13983 v = getnextarg(args, arglen, &argidx);
13984 if (v == NULL)
13985 goto onError;
13986
Benjamin Peterson29060642009-01-31 22:14:21 +000013987 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013988 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 fill = ' ';
13990 switch (c) {
13991
Benjamin Peterson29060642009-01-31 22:14:21 +000013992 case 's':
13993 case 'r':
13994 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013995 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013996 temp = v;
13997 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013998 }
13999 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 if (c == 's')
14001 temp = PyObject_Str(v);
14002 else if (c == 'r')
14003 temp = PyObject_Repr(v);
14004 else
14005 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000014006 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 break;
14008
14009 case 'i':
14010 case 'd':
14011 case 'u':
14012 case 'o':
14013 case 'x':
14014 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000014015 isnumok = 0;
14016 if (PyNumber_Check(v)) {
14017 PyObject *iobj=NULL;
14018
14019 if (PyLong_Check(v)) {
14020 iobj = v;
14021 Py_INCREF(iobj);
14022 }
14023 else {
14024 iobj = PyNumber_Long(v);
14025 }
14026 if (iobj!=NULL) {
14027 if (PyLong_Check(iobj)) {
14028 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020014029 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070014030 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000014031 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 }
14033 else {
14034 Py_DECREF(iobj);
14035 }
14036 }
14037 }
14038 if (!isnumok) {
14039 PyErr_Format(PyExc_TypeError,
14040 "%%%c format: a number is required, "
14041 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
14042 goto onError;
14043 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014044 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014045 fill = '0';
14046 break;
14047
14048 case 'e':
14049 case 'E':
14050 case 'f':
14051 case 'F':
14052 case 'g':
14053 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000014054 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014055 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000014056 fill = '0';
Victor Stinneraff3cc62012-04-30 05:19:21 +020014057 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014058 break;
14059
14060 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014061 {
14062 Py_UCS4 ch = formatchar(v);
14063 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014064 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020014065 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000014066 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014067 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014068
14069 default:
14070 PyErr_Format(PyExc_ValueError,
14071 "unsupported format character '%c' (0x%x) "
14072 "at index %zd",
14073 (31<=c && c<=126) ? (char)c : '?',
14074 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014075 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000014076 goto onError;
14077 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014078 if (temp == NULL)
14079 goto onError;
14080 assert (PyUnicode_Check(temp));
14081 if (PyUnicode_READY(temp) == -1) {
14082 Py_CLEAR(temp);
14083 goto onError;
14084 }
14085 kind = PyUnicode_KIND(temp);
14086 pbuf = PyUnicode_DATA(temp);
14087 len = PyUnicode_GET_LENGTH(temp);
14088
14089 if (c == 's' || c == 'r' || c == 'a') {
14090 if (prec >= 0 && len > prec)
14091 len = prec;
14092 }
14093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014094 /* pbuf is initialized here. */
14095 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000014096 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014097 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14098 if (ch == '-' || ch == '+') {
14099 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014100 len--;
14101 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000014102 }
14103 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014104 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000014105 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014106 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000014107 else
14108 sign = 0;
14109 }
14110 if (width < len)
14111 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014112 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014113 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014114 if (unicode_writer_write_char(&writer, signchar) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014115 goto onError;
14116 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014117 if (width > len)
14118 width--;
14119 }
14120 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014121 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014122 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014123 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014124 if (unicode_writer_prepare(&writer, 2, 127) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014125 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014126 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14127 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14128 writer.pos += 2;
14129 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014131 width -= 2;
14132 if (width < 0)
14133 width = 0;
14134 len -= 2;
14135 }
14136 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014137 Py_ssize_t sublen;
14138 sublen = width - len;
14139 if (unicode_writer_prepare(&writer, sublen, fill) < 0)
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014140 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014141 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
14142 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014143 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 }
14145 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014146 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014147 if (unicode_writer_write_char(&writer, signchar) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014148 goto onError;
14149 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014150 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014151 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14152 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014153
14154 if (unicode_writer_prepare(&writer, 2, 127) < 0)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014155 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014156 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
14157 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
14158 writer.pos += 2;
14159
14160 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014161 }
14162 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014164 /* Copy all characters, preserving len */
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014165 if (unicode_writer_write_str(&writer, temp, pindex, len) < 0)
14166 goto onError;
14167 if (width > len) {
14168 Py_ssize_t sublen = width - len;
14169 if (unicode_writer_prepare(&writer, sublen, ' ') < 0)
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014170 goto onError;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014171 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
14172 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014173 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014174 if (dict && (argidx < arglen) && c != '%') {
14175 PyErr_SetString(PyExc_TypeError,
14176 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014177 goto onError;
14178 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014179 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014180 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181 } /* until end */
14182 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014183 PyErr_SetString(PyExc_TypeError,
14184 "not all arguments converted during string formatting");
14185 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186 }
14187
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014188 if (PyUnicode_Resize(&writer.buffer, writer.pos) < 0)
14189 goto onError;
14190
Guido van Rossumd57fd912000-03-10 22:53:23 +000014191 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014192 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014193 }
14194 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014195 Py_XDECREF(temp);
14196 Py_XDECREF(second);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014197 return writer.buffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014198
Benjamin Peterson29060642009-01-31 22:14:21 +000014199 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014200 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014201 Py_XDECREF(temp);
14202 Py_XDECREF(second);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014203 unicode_writer_dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014204 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014205 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206 }
14207 return NULL;
14208}
14209
Jeremy Hylton938ace62002-07-17 16:30:39 +000014210static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014211unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14212
Tim Peters6d6c1a32001-08-02 04:15:00 +000014213static PyObject *
14214unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14215{
Benjamin Peterson29060642009-01-31 22:14:21 +000014216 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 static char *kwlist[] = {"object", "encoding", "errors", 0};
14218 char *encoding = NULL;
14219 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014220
Benjamin Peterson14339b62009-01-31 16:36:08 +000014221 if (type != &PyUnicode_Type)
14222 return unicode_subtype_new(type, args, kwds);
14223 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014224 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014226 if (x == NULL) {
14227 Py_INCREF(unicode_empty);
14228 return unicode_empty;
14229 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014230 if (encoding == NULL && errors == NULL)
14231 return PyObject_Str(x);
14232 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014233 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014234}
14235
Guido van Rossume023fe02001-08-30 03:12:59 +000014236static PyObject *
14237unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14238{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014239 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014240 Py_ssize_t length, char_size;
14241 int share_wstr, share_utf8;
14242 unsigned int kind;
14243 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014244
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014246
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014247 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014248 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014250 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014251 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014252 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014253 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014254 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014255
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014256 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014257 if (self == NULL) {
14258 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014259 return NULL;
14260 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014261 kind = PyUnicode_KIND(unicode);
14262 length = PyUnicode_GET_LENGTH(unicode);
14263
14264 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014265#ifdef Py_DEBUG
14266 _PyUnicode_HASH(self) = -1;
14267#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014268 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014269#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014270 _PyUnicode_STATE(self).interned = 0;
14271 _PyUnicode_STATE(self).kind = kind;
14272 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014273 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014274 _PyUnicode_STATE(self).ready = 1;
14275 _PyUnicode_WSTR(self) = NULL;
14276 _PyUnicode_UTF8_LENGTH(self) = 0;
14277 _PyUnicode_UTF8(self) = NULL;
14278 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014279 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014280
14281 share_utf8 = 0;
14282 share_wstr = 0;
14283 if (kind == PyUnicode_1BYTE_KIND) {
14284 char_size = 1;
14285 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14286 share_utf8 = 1;
14287 }
14288 else if (kind == PyUnicode_2BYTE_KIND) {
14289 char_size = 2;
14290 if (sizeof(wchar_t) == 2)
14291 share_wstr = 1;
14292 }
14293 else {
14294 assert(kind == PyUnicode_4BYTE_KIND);
14295 char_size = 4;
14296 if (sizeof(wchar_t) == 4)
14297 share_wstr = 1;
14298 }
14299
14300 /* Ensure we won't overflow the length. */
14301 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14302 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014303 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014305 data = PyObject_MALLOC((length + 1) * char_size);
14306 if (data == NULL) {
14307 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014308 goto onError;
14309 }
14310
Victor Stinnerc3c74152011-10-02 20:39:55 +020014311 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014312 if (share_utf8) {
14313 _PyUnicode_UTF8_LENGTH(self) = length;
14314 _PyUnicode_UTF8(self) = data;
14315 }
14316 if (share_wstr) {
14317 _PyUnicode_WSTR_LENGTH(self) = length;
14318 _PyUnicode_WSTR(self) = (wchar_t *)data;
14319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014320
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014321 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014322 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014323 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014324#ifdef Py_DEBUG
14325 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14326#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014327 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014328 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014329
14330onError:
14331 Py_DECREF(unicode);
14332 Py_DECREF(self);
14333 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014334}
14335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014336PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014337 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014338\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014339Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014340encoding defaults to the current default string encoding.\n\
14341errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014342
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014343static PyObject *unicode_iter(PyObject *seq);
14344
Guido van Rossumd57fd912000-03-10 22:53:23 +000014345PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014346 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014347 "str", /* tp_name */
14348 sizeof(PyUnicodeObject), /* tp_size */
14349 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014350 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014351 (destructor)unicode_dealloc, /* tp_dealloc */
14352 0, /* tp_print */
14353 0, /* tp_getattr */
14354 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014355 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014356 unicode_repr, /* tp_repr */
14357 &unicode_as_number, /* tp_as_number */
14358 &unicode_as_sequence, /* tp_as_sequence */
14359 &unicode_as_mapping, /* tp_as_mapping */
14360 (hashfunc) unicode_hash, /* tp_hash*/
14361 0, /* tp_call*/
14362 (reprfunc) unicode_str, /* tp_str */
14363 PyObject_GenericGetAttr, /* tp_getattro */
14364 0, /* tp_setattro */
14365 0, /* tp_as_buffer */
14366 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014367 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 unicode_doc, /* tp_doc */
14369 0, /* tp_traverse */
14370 0, /* tp_clear */
14371 PyUnicode_RichCompare, /* tp_richcompare */
14372 0, /* tp_weaklistoffset */
14373 unicode_iter, /* tp_iter */
14374 0, /* tp_iternext */
14375 unicode_methods, /* tp_methods */
14376 0, /* tp_members */
14377 0, /* tp_getset */
14378 &PyBaseObject_Type, /* tp_base */
14379 0, /* tp_dict */
14380 0, /* tp_descr_get */
14381 0, /* tp_descr_set */
14382 0, /* tp_dictoffset */
14383 0, /* tp_init */
14384 0, /* tp_alloc */
14385 unicode_new, /* tp_new */
14386 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014387};
14388
14389/* Initialize the Unicode implementation */
14390
Victor Stinner3a50e702011-10-18 21:21:00 +020014391int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014392{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014393 int i;
14394
Thomas Wouters477c8d52006-05-27 19:21:47 +000014395 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014396 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014397 0x000A, /* LINE FEED */
14398 0x000D, /* CARRIAGE RETURN */
14399 0x001C, /* FILE SEPARATOR */
14400 0x001D, /* GROUP SEPARATOR */
14401 0x001E, /* RECORD SEPARATOR */
14402 0x0085, /* NEXT LINE */
14403 0x2028, /* LINE SEPARATOR */
14404 0x2029, /* PARAGRAPH SEPARATOR */
14405 };
14406
Fred Drakee4315f52000-05-09 19:53:39 +000014407 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014408 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014409 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014410 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014411 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014412
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014413 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014415 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014416 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014417
14418 /* initialize the linebreak bloom filter */
14419 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014420 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014421 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014422
14423 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014424
14425#ifdef HAVE_MBCS
14426 winver.dwOSVersionInfoSize = sizeof(winver);
14427 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14428 PyErr_SetFromWindowsErr(0);
14429 return -1;
14430 }
14431#endif
14432 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433}
14434
14435/* Finalize the Unicode implementation */
14436
/* Free-list clearing entry point.  The body releases nothing and
   unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14442
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443void
Thomas Wouters78890102000-07-22 19:25:51 +000014444_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014445{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014446 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014447
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014448 Py_XDECREF(unicode_empty);
14449 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014450
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014451 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014452 if (unicode_latin1[i]) {
14453 Py_DECREF(unicode_latin1[i]);
14454 unicode_latin1[i] = NULL;
14455 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014456 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014457 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014458 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014459}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014460
Walter Dörwald16807132007-05-25 13:52:07 +000014461void
14462PyUnicode_InternInPlace(PyObject **p)
14463{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014464 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014465 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014466#ifdef Py_DEBUG
14467 assert(s != NULL);
14468 assert(_PyUnicode_CHECK(s));
14469#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014470 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014471 return;
14472#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014473 /* If it's a subclass, we don't really know what putting
14474 it in the interned dict might do. */
14475 if (!PyUnicode_CheckExact(s))
14476 return;
14477 if (PyUnicode_CHECK_INTERNED(s))
14478 return;
14479 if (interned == NULL) {
14480 interned = PyDict_New();
14481 if (interned == NULL) {
14482 PyErr_Clear(); /* Don't leave an exception */
14483 return;
14484 }
14485 }
14486 /* It might be that the GetItem call fails even
14487 though the key is present in the dictionary,
14488 namely when this happens during a stack overflow. */
14489 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014490 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014491 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014492
Benjamin Peterson29060642009-01-31 22:14:21 +000014493 if (t) {
14494 Py_INCREF(t);
14495 Py_DECREF(*p);
14496 *p = t;
14497 return;
14498 }
Walter Dörwald16807132007-05-25 13:52:07 +000014499
Benjamin Peterson14339b62009-01-31 16:36:08 +000014500 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014501 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014502 PyErr_Clear();
14503 PyThreadState_GET()->recursion_critical = 0;
14504 return;
14505 }
14506 PyThreadState_GET()->recursion_critical = 0;
14507 /* The two references in interned are not counted by refcnt.
14508 The deallocator will take care of this */
14509 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014510 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014511}
14512
14513void
14514PyUnicode_InternImmortal(PyObject **p)
14515{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014516 PyUnicode_InternInPlace(p);
14517 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014518 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014519 Py_INCREF(*p);
14520 }
Walter Dörwald16807132007-05-25 13:52:07 +000014521}
14522
14523PyObject *
14524PyUnicode_InternFromString(const char *cp)
14525{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 PyObject *s = PyUnicode_FromString(cp);
14527 if (s == NULL)
14528 return NULL;
14529 PyUnicode_InternInPlace(&s);
14530 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014531}
14532
Alexander Belopolsky40018472011-02-26 01:02:56 +000014533void
14534_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014535{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014536 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014537 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014538 Py_ssize_t i, n;
14539 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014540
Benjamin Peterson14339b62009-01-31 16:36:08 +000014541 if (interned == NULL || !PyDict_Check(interned))
14542 return;
14543 keys = PyDict_Keys(interned);
14544 if (keys == NULL || !PyList_Check(keys)) {
14545 PyErr_Clear();
14546 return;
14547 }
Walter Dörwald16807132007-05-25 13:52:07 +000014548
Benjamin Peterson14339b62009-01-31 16:36:08 +000014549 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14550 detector, interned unicode strings are not forcibly deallocated;
14551 rather, we give them their stolen references back, and then clear
14552 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014553
Benjamin Peterson14339b62009-01-31 16:36:08 +000014554 n = PyList_GET_SIZE(keys);
14555 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014556 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014557 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014558 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014559 if (PyUnicode_READY(s) == -1) {
14560 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014561 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014562 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014563 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014564 case SSTATE_NOT_INTERNED:
14565 /* XXX Shouldn't happen */
14566 break;
14567 case SSTATE_INTERNED_IMMORTAL:
14568 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014569 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014570 break;
14571 case SSTATE_INTERNED_MORTAL:
14572 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014573 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014574 break;
14575 default:
14576 Py_FatalError("Inconsistent interned string state.");
14577 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014578 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014579 }
14580 fprintf(stderr, "total size of all interned strings: "
14581 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14582 "mortal/immortal\n", mortal_size, immortal_size);
14583 Py_DECREF(keys);
14584 PyDict_Clear(interned);
14585 Py_DECREF(interned);
14586 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014587}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014588
14589
14590/********************* Unicode Iterator **************************/
14591
14592typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014593 PyObject_HEAD
14594 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014595 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014596} unicodeiterobject;
14597
14598static void
14599unicodeiter_dealloc(unicodeiterobject *it)
14600{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014601 _PyObject_GC_UNTRACK(it);
14602 Py_XDECREF(it->it_seq);
14603 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014604}
14605
14606static int
14607unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14608{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014609 Py_VISIT(it->it_seq);
14610 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014611}
14612
14613static PyObject *
14614unicodeiter_next(unicodeiterobject *it)
14615{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014616 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014617
Benjamin Peterson14339b62009-01-31 16:36:08 +000014618 assert(it != NULL);
14619 seq = it->it_seq;
14620 if (seq == NULL)
14621 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014622 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014624 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14625 int kind = PyUnicode_KIND(seq);
14626 void *data = PyUnicode_DATA(seq);
14627 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14628 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014629 if (item != NULL)
14630 ++it->it_index;
14631 return item;
14632 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014633
Benjamin Peterson14339b62009-01-31 16:36:08 +000014634 Py_DECREF(seq);
14635 it->it_seq = NULL;
14636 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014637}
14638
14639static PyObject *
14640unicodeiter_len(unicodeiterobject *it)
14641{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014642 Py_ssize_t len = 0;
14643 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014644 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014645 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014646}
14647
14648PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14649
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014650static PyObject *
14651unicodeiter_reduce(unicodeiterobject *it)
14652{
14653 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014654 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014655 it->it_seq, it->it_index);
14656 } else {
14657 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14658 if (u == NULL)
14659 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014660 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014661 }
14662}
14663
14664PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14665
14666static PyObject *
14667unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14668{
14669 Py_ssize_t index = PyLong_AsSsize_t(state);
14670 if (index == -1 && PyErr_Occurred())
14671 return NULL;
14672 if (index < 0)
14673 index = 0;
14674 it->it_index = index;
14675 Py_RETURN_NONE;
14676}
14677
14678PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14679
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014680static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014681 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014682 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014683 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14684 reduce_doc},
14685 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14686 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014687 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014688};
14689
14690PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014691 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14692 "str_iterator", /* tp_name */
14693 sizeof(unicodeiterobject), /* tp_basicsize */
14694 0, /* tp_itemsize */
14695 /* methods */
14696 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14697 0, /* tp_print */
14698 0, /* tp_getattr */
14699 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014700 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014701 0, /* tp_repr */
14702 0, /* tp_as_number */
14703 0, /* tp_as_sequence */
14704 0, /* tp_as_mapping */
14705 0, /* tp_hash */
14706 0, /* tp_call */
14707 0, /* tp_str */
14708 PyObject_GenericGetAttr, /* tp_getattro */
14709 0, /* tp_setattro */
14710 0, /* tp_as_buffer */
14711 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14712 0, /* tp_doc */
14713 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14714 0, /* tp_clear */
14715 0, /* tp_richcompare */
14716 0, /* tp_weaklistoffset */
14717 PyObject_SelfIter, /* tp_iter */
14718 (iternextfunc)unicodeiter_next, /* tp_iternext */
14719 unicodeiter_methods, /* tp_methods */
14720 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014721};
14722
14723static PyObject *
14724unicode_iter(PyObject *seq)
14725{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014726 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014727
Benjamin Peterson14339b62009-01-31 16:36:08 +000014728 if (!PyUnicode_Check(seq)) {
14729 PyErr_BadInternalCall();
14730 return NULL;
14731 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014732 if (PyUnicode_READY(seq) == -1)
14733 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014734 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14735 if (it == NULL)
14736 return NULL;
14737 it->it_index = 0;
14738 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014739 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014740 _PyObject_GC_TRACK(it);
14741 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014742}
14743
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014744
14745size_t
14746Py_UNICODE_strlen(const Py_UNICODE *u)
14747{
14748 int res = 0;
14749 while(*u++)
14750 res++;
14751 return res;
14752}
14753
14754Py_UNICODE*
14755Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14756{
14757 Py_UNICODE *u = s1;
14758 while ((*u++ = *s2++));
14759 return s1;
14760}
14761
14762Py_UNICODE*
14763Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14764{
14765 Py_UNICODE *u = s1;
14766 while ((*u++ = *s2++))
14767 if (n-- == 0)
14768 break;
14769 return s1;
14770}
14771
14772Py_UNICODE*
14773Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14774{
14775 Py_UNICODE *u1 = s1;
14776 u1 += Py_UNICODE_strlen(u1);
14777 Py_UNICODE_strcpy(u1, s2);
14778 return s1;
14779}
14780
14781int
14782Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14783{
14784 while (*s1 && *s2 && *s1 == *s2)
14785 s1++, s2++;
14786 if (*s1 && *s2)
14787 return (*s1 < *s2) ? -1 : +1;
14788 if (*s1)
14789 return 1;
14790 if (*s2)
14791 return -1;
14792 return 0;
14793}
14794
14795int
14796Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14797{
14798 register Py_UNICODE u1, u2;
14799 for (; n != 0; n--) {
14800 u1 = *s1;
14801 u2 = *s2;
14802 if (u1 != u2)
14803 return (u1 < u2) ? -1 : +1;
14804 if (u1 == '\0')
14805 return 0;
14806 s1++;
14807 s2++;
14808 }
14809 return 0;
14810}
14811
14812Py_UNICODE*
14813Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14814{
14815 const Py_UNICODE *p;
14816 for (p = s; *p; p++)
14817 if (*p == c)
14818 return (Py_UNICODE*)p;
14819 return NULL;
14820}
14821
14822Py_UNICODE*
14823Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14824{
14825 const Py_UNICODE *p;
14826 p = s + Py_UNICODE_strlen(s);
14827 while (p != s) {
14828 p--;
14829 if (*p == c)
14830 return (Py_UNICODE*)p;
14831 }
14832 return NULL;
14833}
Victor Stinner331ea922010-08-10 16:37:20 +000014834
Victor Stinner71133ff2010-09-01 23:43:53 +000014835Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014836PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014837{
Victor Stinner577db2c2011-10-11 22:12:48 +020014838 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014839 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014840
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014841 if (!PyUnicode_Check(unicode)) {
14842 PyErr_BadArgument();
14843 return NULL;
14844 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014845 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014846 if (u == NULL)
14847 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014848 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014849 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014850 PyErr_NoMemory();
14851 return NULL;
14852 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014853 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014854 size *= sizeof(Py_UNICODE);
14855 copy = PyMem_Malloc(size);
14856 if (copy == NULL) {
14857 PyErr_NoMemory();
14858 return NULL;
14859 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014860 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014861 return copy;
14862}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014863
Georg Brandl66c221e2010-10-14 07:04:07 +000014864/* A _string module, to export formatter_parser and formatter_field_name_split
14865 to the string.Formatter class implemented in Python. */
14866
14867static PyMethodDef _string_methods[] = {
14868 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14869 METH_O, PyDoc_STR("split the argument as a field name")},
14870 {"formatter_parser", (PyCFunction) formatter_parser,
14871 METH_O, PyDoc_STR("parse the argument as a format string")},
14872 {NULL, NULL}
14873};
14874
14875static struct PyModuleDef _string_module = {
14876 PyModuleDef_HEAD_INIT,
14877 "_string",
14878 PyDoc_STR("string helper module"),
14879 0,
14880 _string_methods,
14881 NULL,
14882 NULL,
14883 NULL,
14884 NULL
14885};
14886
14887PyMODINIT_FUNC
14888PyInit__string(void)
14889{
14890 return PyModule_Create(&_string_module);
14891}
14892
14893
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014894#ifdef __cplusplus
14895}
14896#endif