blob: 8fbc203c53029061a54798ee31020e880d7f7616 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order selection: big endian only when the build defines
   WORDS_BIGENDIAN, little endian in every other case. */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point defined by Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Check applied by the accessor macros below: the full consistency check
   (without content scan) in debug builds, a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* utf8 field of the PyCompactUnicodeObject header (no checks; usable as an
   lvalue). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string: the memory directly following the
   PyASCIIObject header for a compact ASCII object, the utf8 field
   otherwise. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     !PyUnicode_IS_COMPACT_ASCII(op) ? \
         _PyUnicode_UTF8(op) : \
         ((char*)((PyASCIIObject*)(op) + 1)))

/* utf8_length field of the PyCompactUnicodeObject header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string: the character length for a
   compact ASCII object, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     !PyUnicode_IS_COMPACT_ASCII(op) ? \
         _PyUnicode_UTF8_LENGTH(op) : \
         ((PyASCIIObject*)(op))->length)

/* Unchecked accessors for individual header fields. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length accessors that first run _PyUnicode_CHECK() on the
   object. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Cheap replacement for Py_MAX() when combining two maximum-character
   values, e.g. to compute the second argument of PyUnicode_New().  The
   result is the bitwise OR of both operands, so it is never smaller than
   either of them (but it is not necessarily equal to the larger one). */
#define MAX_MAXCHAR(maxchar1, maxchar2) \
    ((maxchar2) | (maxchar1))
118
/* File-local override of PyUnicode_READY(): checks the object with
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? \
      _PyUnicode_Ready(op) : \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 pointer of the object aliases its character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object aliases its character data.
   The macro only refers to its own parameter, so it can be applied to any
   expression and not just to a variable that happens to be called
   "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the object owns a UTF-8 buffer of its own: it is not a compact
   ASCII object, its utf8 pointer is set, and that pointer does not alias
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) != NULL \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the object owns a wstr buffer of its own: its wstr pointer is set
   and it is not the case that the string is ready with wstr aliasing the
   character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) != NULL && \
      !(PyUnicode_IS_READY(op) && \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type, to_type -- valid type names of the source and destination
                         characters.
   begin, end         -- pointers of type "from_type *" delimiting the source
                         characters (end is exclusive).
   to                 -- pointer to the buffer receiving the converted
                         characters; it is converted to "to_type *" as a
                         whole, so an expression such as "buf + offset" is
                         evaluated before the cast is applied.

   Characters are copied four at a time while at least four remain, then one
   at a time; every value is converted with a plain cast to to_type. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry N is 1 when ASCII code N is whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
/* Lookup table for fast detection of line breaks in the ASCII range: entry N
   is 1 when ASCII code N is a line break, 0 otherwise.  The table is only
   ever read, hence const (like _Py_ascii_whitespace). */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
#ifdef Py_DEBUG
/* Debug-build validation of the internal invariants of a unicode object.

   op            -- object to validate; must pass PyUnicode_Check().
   check_content -- when non-zero (and the object is not of
                    PyUnicode_WCHAR_KIND), also scan the characters to verify
                    that the narrowest possible kind is used and that the
                    data is followed by a zero character.

   Every invariant is enforced with assert(); the function returns 1 when it
   returns at all, so a call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character size equals sizeof(wchar_t): only strings of
           this kind share their data with wstr */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            /* non-compact object: the data lives in a separate block */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* not-ready string: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII object: the data follows the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent buffer must come with a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;

        data = PyUnicode_DATA(ascii);
        for (i = 0; i < ascii->length; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be zero */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
#if defined(HAVE_MBCS)
/* Windows version information; only compiled when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
519/* stuff to implement simple "bloom filters" for Unicode characters.
520 to keep things simple, we use a single bitmask, using the least 5
521 bits from each unicode characters as the bit index. */
522
523/* the linebreak mask is set up by Unicode_Init below */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535#define BLOOM_MASK unsigned long
536
537static BLOOM_MASK bloom_linebreak;
538
/* BLOOM_ADD(mask, ch): set the bit selected by character ch in the lvalue
   mask.
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask; a zero
   result means ch was never added, a non-zero result only means it may have
   been.
   The mask argument is parenthesized so that compound expressions such as
   "a | b" are tested as a whole. */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: a table lookup for characters below 128; for the others
   Py_UNICODE_ISLINEBREAK() is only consulted when the bloom_linebreak filter
   does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
/* True if character chr occurs in the unicode object str.  The bloom filter
   `mask` is tested first, so PyUnicode_FindChar() only runs when the filter
   cannot rule the character out. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && !(PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200657 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100659 assert(PyUnicode_IS_COMPACT(unicode));
660
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200661 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100662 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 struct_size = sizeof(PyASCIIObject);
664 else
665 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200666 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
669 PyErr_NoMemory();
670 return NULL;
671 }
672 new_size = (struct_size + (length + 1) * char_size);
673
Victor Stinner84def372011-12-11 20:04:56 +0100674 _Py_DEC_REFTOTAL;
675 _Py_ForgetReference(unicode);
676
677 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
678 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100679 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 PyErr_NoMemory();
681 return NULL;
682 }
Victor Stinner84def372011-12-11 20:04:56 +0100683 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100685
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200687 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200688 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100689 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200690 _PyUnicode_WSTR_LENGTH(unicode) = length;
691 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
693 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200694 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200695 return unicode;
696}
697
Alexander Belopolsky40018472011-02-26 01:02:56 +0000698static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200699resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700{
Victor Stinner95663112011-10-04 01:03:50 +0200701 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100702 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000705
Victor Stinnerfe226c02011-10-03 03:52:20 +0200706 if (PyUnicode_IS_READY(unicode)) {
707 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200708 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 void *data;
710
711 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200713 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
714 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715
716 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
717 PyErr_NoMemory();
718 return -1;
719 }
720 new_size = (length + 1) * char_size;
721
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
723 {
724 PyObject_DEL(_PyUnicode_UTF8(unicode));
725 _PyUnicode_UTF8(unicode) = NULL;
726 _PyUnicode_UTF8_LENGTH(unicode) = 0;
727 }
728
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 data = (PyObject *)PyObject_REALLOC(data, new_size);
730 if (data == NULL) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200737 _PyUnicode_WSTR_LENGTH(unicode) = length;
738 }
739 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200740 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_UTF8_LENGTH(unicode) = length;
742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 _PyUnicode_LENGTH(unicode) = length;
744 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200745 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200746 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200748 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749 }
Victor Stinner95663112011-10-04 01:03:50 +0200750 assert(_PyUnicode_WSTR(unicode) != NULL);
751
752 /* check for integer overflow */
753 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
754 PyErr_NoMemory();
755 return -1;
756 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200758 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100759 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200760 if (!wstr) {
761 PyErr_NoMemory();
762 return -1;
763 }
764 _PyUnicode_WSTR(unicode) = wstr;
765 _PyUnicode_WSTR(unicode)[length] = 0;
766 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200767 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768 return 0;
769}
770
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771static PyObject*
772resize_copy(PyObject *unicode, Py_ssize_t length)
773{
774 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777
Benjamin Petersonbac79492012-01-14 13:34:47 -0500778 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100779 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200780
781 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
782 if (copy == NULL)
783 return NULL;
784
785 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200786 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200788 }
789 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100791
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200792 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 if (w == NULL)
794 return NULL;
795 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
796 copy_length = Py_MIN(copy_length, length);
797 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
798 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200799 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200800 }
801}
802
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812#ifdef Py_DEBUG
/* Debug statistic: incremented by _PyUnicode_New() each time it goes on to
   allocate a new legacy string. */
static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814#endif
815
Alexander Belopolsky40018472011-02-26 01:02:56 +0000816static PyUnicodeObject *
817_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818{
819 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200820 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 if (length == 0 && unicode_empty != NULL) {
824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200825 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000828 /* Ensure we won't overflow the size. */
829 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
830 return (PyUnicodeObject *)PyErr_NoMemory();
831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200832 if (length < 0) {
833 PyErr_SetString(PyExc_SystemError,
834 "Negative size passed to _PyUnicode_New");
835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 }
837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838#ifdef Py_DEBUG
839 ++unicode_old_new_calls;
840#endif
841
842 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
843 if (unicode == NULL)
844 return NULL;
845 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
846 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
847 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000849 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100850 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000851 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200852
Jeremy Hyltond8082792003-09-16 19:41:39 +0000853 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000854 * the caller fails before initializing str -- unicode_resize()
855 * reads str[0], and the Keep-Alive optimization can keep memory
856 * allocated for str alive across a call to unicode_dealloc(unicode).
857 * We don't want unicode_resize to read uninitialized memory in
858 * that case.
859 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 _PyUnicode_WSTR(unicode)[0] = 0;
861 _PyUnicode_WSTR(unicode)[length] = 0;
862 _PyUnicode_WSTR_LENGTH(unicode) = length;
863 _PyUnicode_HASH(unicode) = -1;
864 _PyUnicode_STATE(unicode).interned = 0;
865 _PyUnicode_STATE(unicode).kind = 0;
866 _PyUnicode_STATE(unicode).compact = 0;
867 _PyUnicode_STATE(unicode).ready = 0;
868 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200869 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200871 _PyUnicode_UTF8(unicode) = NULL;
872 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100873 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000874 return unicode;
875}
876
Victor Stinnerf42dc442011-10-02 23:33:16 +0200877static const char*
878unicode_kind_name(PyObject *unicode)
879{
Victor Stinner42dfd712011-10-03 14:41:45 +0200880 /* don't check consistency: unicode_kind_name() is called from
881 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882 if (!PyUnicode_IS_COMPACT(unicode))
883 {
884 if (!PyUnicode_IS_READY(unicode))
885 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600886 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 {
888 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 return "legacy ascii";
891 else
892 return "legacy latin1";
893 case PyUnicode_2BYTE_KIND:
894 return "legacy UCS2";
895 case PyUnicode_4BYTE_KIND:
896 return "legacy UCS4";
897 default:
898 return "<legacy invalid kind>";
899 }
900 }
901 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600902 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200904 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 return "ascii";
906 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200911 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 default:
913 return "<invalid compact kind>";
914 }
915}
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917#ifdef Py_DEBUG
/* Debug statistic: incremented by PyUnicode_New() each time it goes past
   the empty-string shortcut. */
static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200919
920/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
924
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
928void *_PyUnicode_data(void *unicode){
929 printf("obj %p\n", unicode);
930 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
931 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
932 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
933 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
934 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
935 return PyUnicode_DATA(unicode);
936}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200937
938void
939_PyUnicode_Dump(PyObject *op)
940{
941 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200942 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
943 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
944 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200945
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200947 {
948 if (ascii->state.ascii)
949 data = (ascii + 1);
950 else
951 data = (compact + 1);
952 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 else
954 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200955 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
956
Victor Stinnera849a4b2011-10-03 12:12:11 +0200957 if (ascii->wstr == data)
958 printf("shared ");
959 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200960
Victor Stinnera3b334d2011-10-03 13:53:37 +0200961 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 printf(" (%zu), ", compact->wstr_length);
963 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
964 printf("shared ");
965 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200966 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200967 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200968}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969#endif
970
971PyObject *
972PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
973{
974 PyObject *obj;
975 PyCompactUnicodeObject *unicode;
976 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200977 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200978 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979 Py_ssize_t char_size;
980 Py_ssize_t struct_size;
981
982 /* Optimization for empty strings */
983 if (size == 0 && unicode_empty != NULL) {
984 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200985 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 }
987
988#ifdef Py_DEBUG
989 ++unicode_new_new_calls;
990#endif
991
Victor Stinner9e9d6892011-10-04 01:02:02 +0200992 is_ascii = 0;
993 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200994 struct_size = sizeof(PyCompactUnicodeObject);
995 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200996 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 char_size = 1;
998 is_ascii = 1;
999 struct_size = sizeof(PyASCIIObject);
1000 }
1001 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001002 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 char_size = 1;
1004 }
1005 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001006 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001007 char_size = 2;
1008 if (sizeof(wchar_t) == 2)
1009 is_sharing = 1;
1010 }
1011 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001012 if (maxchar > MAX_UNICODE) {
1013 PyErr_SetString(PyExc_SystemError,
1014 "invalid maximum character passed to PyUnicode_New");
1015 return NULL;
1016 }
Victor Stinner8f825062012-04-27 13:55:39 +02001017 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001018 char_size = 4;
1019 if (sizeof(wchar_t) == 4)
1020 is_sharing = 1;
1021 }
1022
1023 /* Ensure we won't overflow the size. */
1024 if (size < 0) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "Negative size passed to PyUnicode_New");
1027 return NULL;
1028 }
1029 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1030 return PyErr_NoMemory();
1031
1032 /* Duplicated allocation code from _PyObject_New() instead of a call to
1033 * PyObject_New() so we are able to allocate space for the object and
1034 * it's data buffer.
1035 */
1036 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1037 if (obj == NULL)
1038 return PyErr_NoMemory();
1039 obj = PyObject_INIT(obj, &PyUnicode_Type);
1040 if (obj == NULL)
1041 return NULL;
1042
1043 unicode = (PyCompactUnicodeObject *)obj;
1044 if (is_ascii)
1045 data = ((PyASCIIObject*)obj) + 1;
1046 else
1047 data = unicode + 1;
1048 _PyUnicode_LENGTH(unicode) = size;
1049 _PyUnicode_HASH(unicode) = -1;
1050 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 _PyUnicode_STATE(unicode).compact = 1;
1053 _PyUnicode_STATE(unicode).ready = 1;
1054 _PyUnicode_STATE(unicode).ascii = is_ascii;
1055 if (is_ascii) {
1056 ((char*)data)[size] = 0;
1057 _PyUnicode_WSTR(unicode) = NULL;
1058 }
Victor Stinner8f825062012-04-27 13:55:39 +02001059 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 ((char*)data)[size] = 0;
1061 _PyUnicode_WSTR(unicode) = NULL;
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001064 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 else {
1067 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001068 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((Py_UCS4*)data)[size] = 0;
1073 if (is_sharing) {
1074 _PyUnicode_WSTR_LENGTH(unicode) = size;
1075 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1076 }
1077 else {
1078 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1079 _PyUnicode_WSTR(unicode) = NULL;
1080 }
1081 }
Victor Stinner8f825062012-04-27 13:55:39 +02001082#ifdef Py_DEBUG
1083 /* Fill the data with invalid characters to detect bugs earlier.
1084 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1085 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1086 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1087 memset(data, 0xff, size * kind);
1088#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001089 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001090 return obj;
1091}
1092
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into a single code point; every
   other unit, including a lone surrogate, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* valid surrogate pair: two units become one code point */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1132
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133static int
Victor Stinner488fa492011-12-12 00:01:39 +01001134unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001135{
Victor Stinner488fa492011-12-12 00:01:39 +01001136 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001137 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001138 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return -1;
1140 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141 return 0;
1142}
1143
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001144static int
1145_copy_characters(PyObject *to, Py_ssize_t to_start,
1146 PyObject *from, Py_ssize_t from_start,
1147 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 unsigned int from_kind, to_kind;
1150 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001152
Victor Stinneree4544c2012-05-09 22:24:08 +02001153 assert(0 <= how_many);
1154 assert(0 <= from_start);
1155 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_Check(from));
1157 assert(PyUnicode_Check(to));
1158 assert(PyUnicode_IS_READY(from));
1159 assert(PyUnicode_IS_READY(to));
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001162
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001163 if (how_many == 0)
1164 return 0;
1165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001167 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001170
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001171#ifdef Py_DEBUG
1172 if (!check_maxchar
1173 && (from_kind > to_kind
1174 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001175 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001176 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1177 Py_UCS4 ch;
1178 Py_ssize_t i;
1179 for (i=0; i < how_many; i++) {
1180 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1181 assert(ch <= to_maxchar);
1182 }
1183 }
1184#endif
1185 fast = (from_kind == to_kind);
1186 if (check_maxchar
1187 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1188 {
1189 /* deny latin1 => ascii */
1190 fast = 0;
1191 }
1192
1193 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001194 Py_MEMCPY((char*)to_data + to_kind * to_start,
1195 (char*)from_data + from_kind * from_start,
1196 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001197 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001198 else if (from_kind == PyUnicode_1BYTE_KIND
1199 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001200 {
1201 _PyUnicode_CONVERT_BYTES(
1202 Py_UCS1, Py_UCS2,
1203 PyUnicode_1BYTE_DATA(from) + from_start,
1204 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1205 PyUnicode_2BYTE_DATA(to) + to_start
1206 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001207 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001208 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 && to_kind == PyUnicode_4BYTE_KIND)
1210 {
1211 _PyUnicode_CONVERT_BYTES(
1212 Py_UCS1, Py_UCS4,
1213 PyUnicode_1BYTE_DATA(from) + from_start,
1214 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1215 PyUnicode_4BYTE_DATA(to) + to_start
1216 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001217 }
1218 else if (from_kind == PyUnicode_2BYTE_KIND
1219 && to_kind == PyUnicode_4BYTE_KIND)
1220 {
1221 _PyUnicode_CONVERT_BYTES(
1222 Py_UCS2, Py_UCS4,
1223 PyUnicode_2BYTE_DATA(from) + from_start,
1224 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1225 PyUnicode_4BYTE_DATA(to) + to_start
1226 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001227 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001228 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 /* check if max_char(from substring) <= max_char(to) */
1230 if (from_kind > to_kind
1231 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001232 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001233 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 /* slow path to check for character overflow */
1235 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001236 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 Py_ssize_t i;
1238
Victor Stinner56c161a2011-10-06 02:47:11 +02001239#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001240 for (i=0; i < how_many; i++) {
1241 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001242 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001243 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1244 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001245#else
1246 if (!check_maxchar) {
1247 for (i=0; i < how_many; i++) {
1248 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1249 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1250 }
1251 }
1252 else {
1253 for (i=0; i < how_many; i++) {
1254 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1255 if (ch > to_maxchar)
1256 return 1;
1257 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1258 }
1259 }
1260#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001261 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001262 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001263 assert(0 && "inconsistent state");
1264 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001265 }
1266 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001267 return 0;
1268}
1269
1270static void
1271copy_characters(PyObject *to, Py_ssize_t to_start,
1272 PyObject *from, Py_ssize_t from_start,
1273 Py_ssize_t how_many)
1274{
1275 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1276}
1277
1278Py_ssize_t
1279PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1280 PyObject *from, Py_ssize_t from_start,
1281 Py_ssize_t how_many)
1282{
1283 int err;
1284
1285 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1286 PyErr_BadInternalCall();
1287 return -1;
1288 }
1289
Benjamin Petersonbac79492012-01-14 13:34:47 -05001290 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001292 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001293 return -1;
1294
1295 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1296 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1297 PyErr_Format(PyExc_SystemError,
1298 "Cannot write %zi characters at %zi "
1299 "in a string of %zi characters",
1300 how_many, to_start, PyUnicode_GET_LENGTH(to));
1301 return -1;
1302 }
1303
1304 if (how_many == 0)
1305 return 0;
1306
Victor Stinner488fa492011-12-12 00:01:39 +01001307 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001308 return -1;
1309
1310 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1311 if (err) {
1312 PyErr_Format(PyExc_SystemError,
1313 "Cannot copy %s characters "
1314 "into a string of %s characters",
1315 unicode_kind_name(from),
1316 unicode_kind_name(to));
1317 return -1;
1318 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001319 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320}
1321
Victor Stinner17222162011-09-28 22:15:37 +02001322/* Find the maximum code point and count the number of surrogate pairs so a
1323 correct string length can be computed before converting a string to UCS4.
1324 This function counts single surrogates as a character and not as a pair.
1325
1326 Return 0 on success, or -1 on error. */
1327static int
1328find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1329 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330{
1331 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001332 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333
Victor Stinnerc53be962011-10-02 21:33:54 +02001334 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001335 *num_surrogates = 0;
1336 *maxchar = 0;
1337
1338 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001340 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1341 && (iter+1) < end
1342 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001344 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 iter += 2;
1347 }
1348 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001350 {
1351 ch = *iter;
1352 iter++;
1353 }
1354 if (ch > *maxchar) {
1355 *maxchar = ch;
1356 if (*maxchar > MAX_UNICODE) {
1357 PyErr_Format(PyExc_ValueError,
1358 "character U+%x is not in range [U+0000; U+10ffff]",
1359 ch);
1360 return -1;
1361 }
1362 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 }
1364 return 0;
1365}
1366
1367#ifdef Py_DEBUG
/* Debug statistic: incremented by _PyUnicode_Ready() each time it starts
   converting a legacy string. */
static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369#endif
1370
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001371int
1372_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373{
1374 wchar_t *end;
1375 Py_UCS4 maxchar = 0;
1376 Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378 Py_ssize_t length_wo_surrogates;
1379#endif
1380
Georg Brandl7597add2011-10-05 16:36:47 +02001381 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 strings were created using _PyObject_New() and where no canonical
1383 representation (the str field) has been set yet aka strings
1384 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001385 assert(_PyUnicode_CHECK(unicode));
1386 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001390 /* Actually, it should neither be interned nor be anything else: */
1391 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
1393#ifdef Py_DEBUG
1394 ++unicode_ready_calls;
1395#endif
1396
1397 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001398 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001399 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401
1402 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001403 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1404 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 PyErr_NoMemory();
1406 return -1;
1407 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001408 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 _PyUnicode_WSTR(unicode), end,
1410 PyUnicode_1BYTE_DATA(unicode));
1411 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1412 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1413 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1414 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001415 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001416 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 }
1419 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001420 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001421 _PyUnicode_UTF8(unicode) = NULL;
1422 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001423 }
1424 PyObject_FREE(_PyUnicode_WSTR(unicode));
1425 _PyUnicode_WSTR(unicode) = NULL;
1426 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1427 }
1428 /* In this case we might have to convert down from 4-byte native
1429 wchar_t to 2-byte unicode. */
1430 else if (maxchar < 65536) {
1431 assert(num_surrogates == 0 &&
1432 "FindMaxCharAndNumSurrogatePairs() messed up");
1433
Victor Stinner506f5922011-09-28 22:34:18 +02001434#if SIZEOF_WCHAR_T == 2
1435 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001436 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001437 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1438 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1439 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001442#else
1443 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001444 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001445 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001446 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001447 PyErr_NoMemory();
1448 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 }
Victor Stinner506f5922011-09-28 22:34:18 +02001450 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1451 _PyUnicode_WSTR(unicode), end,
1452 PyUnicode_2BYTE_DATA(unicode));
1453 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1454 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1455 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001456 _PyUnicode_UTF8(unicode) = NULL;
1457 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001458 PyObject_FREE(_PyUnicode_WSTR(unicode));
1459 _PyUnicode_WSTR(unicode) = NULL;
1460 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1461#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 }
1463 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1464 else {
1465#if SIZEOF_WCHAR_T == 2
1466 /* in case the native representation is 2-bytes, we need to allocate a
1467 new normalized 4-byte version. */
1468 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001469 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1470 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 PyErr_NoMemory();
1472 return -1;
1473 }
1474 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1475 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001476 _PyUnicode_UTF8(unicode) = NULL;
1477 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001478 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1479 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001480 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 PyObject_FREE(_PyUnicode_WSTR(unicode));
1482 _PyUnicode_WSTR(unicode) = NULL;
1483 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1484#else
1485 assert(num_surrogates == 0);
1486
Victor Stinnerc3c74152011-10-02 20:39:55 +02001487 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001489 _PyUnicode_UTF8(unicode) = NULL;
1490 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1492#endif
1493 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1494 }
1495 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001496 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497 return 0;
1498}
1499
Alexander Belopolsky40018472011-02-26 01:02:56 +00001500static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001501unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502{
Walter Dörwald16807132007-05-25 13:52:07 +00001503 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 case SSTATE_NOT_INTERNED:
1505 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001506
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 case SSTATE_INTERNED_MORTAL:
1508 /* revive dead object temporarily for DelItem */
1509 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001510 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001511 Py_FatalError(
1512 "deletion of interned string failed");
1513 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001514
Benjamin Peterson29060642009-01-31 22:14:21 +00001515 case SSTATE_INTERNED_IMMORTAL:
1516 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001517
Benjamin Peterson29060642009-01-31 22:14:21 +00001518 default:
1519 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001520 }
1521
Victor Stinner03490912011-10-03 23:45:12 +02001522 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001523 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001524 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001525 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001526 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1527 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001529 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530}
1531
#ifdef Py_DEBUG
/* Debug-only helper: return 1 when `unicode` is one of the shared singleton
   strings (the empty string, or a cached one-character Latin-1 string from
   unicode_latin1[]), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only canonical (non-wchar-kind) strings of length 1 can be a cached
       Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1548
Alexander Belopolsky40018472011-02-26 01:02:56 +00001549static int
Victor Stinner488fa492011-12-12 00:01:39 +01001550unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001551{
Victor Stinner488fa492011-12-12 00:01:39 +01001552 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 if (Py_REFCNT(unicode) != 1)
1554 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001555 if (_PyUnicode_HASH(unicode) != -1)
1556 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001557 if (PyUnicode_CHECK_INTERNED(unicode))
1558 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001559 if (!PyUnicode_CheckExact(unicode))
1560 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001561#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 /* singleton refcount is greater than 1 */
1563 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001564#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001565 return 1;
1566}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001567
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568static int
1569unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1570{
1571 PyObject *unicode;
1572 Py_ssize_t old_length;
1573
1574 assert(p_unicode != NULL);
1575 unicode = *p_unicode;
1576
1577 assert(unicode != NULL);
1578 assert(PyUnicode_Check(unicode));
1579 assert(0 <= length);
1580
Victor Stinner910337b2011-10-03 03:20:16 +02001581 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001582 old_length = PyUnicode_WSTR_LENGTH(unicode);
1583 else
1584 old_length = PyUnicode_GET_LENGTH(unicode);
1585 if (old_length == length)
1586 return 0;
1587
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001588 if (length == 0) {
1589 Py_DECREF(*p_unicode);
1590 *p_unicode = unicode_empty;
1591 Py_INCREF(*p_unicode);
1592 return 0;
1593 }
1594
Victor Stinner488fa492011-12-12 00:01:39 +01001595 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 PyObject *copy = resize_copy(unicode, length);
1597 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001599 Py_DECREF(*p_unicode);
1600 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001601 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001602 }
1603
Victor Stinnerfe226c02011-10-03 03:52:20 +02001604 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001605 PyObject *new_unicode = resize_compact(unicode, length);
1606 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001608 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001609 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001610 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001611 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001612}
1613
Alexander Belopolsky40018472011-02-26 01:02:56 +00001614int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001616{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 PyObject *unicode;
1618 if (p_unicode == NULL) {
1619 PyErr_BadInternalCall();
1620 return -1;
1621 }
1622 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001623 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 {
1625 PyErr_BadInternalCall();
1626 return -1;
1627 }
1628 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001629}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001630
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001631static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001632unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1633 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001634{
1635 PyObject *result;
1636 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001637 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001638 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1639 return 0;
1640 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1641 maxchar);
1642 if (result == NULL)
1643 return -1;
Victor Stinner1b487b42012-05-03 12:29:04 +02001644 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001645 Py_DECREF(*p_unicode);
1646 *p_unicode = result;
1647 return 0;
1648}
1649
1650static int
1651unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1652 Py_UCS4 ch)
1653{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001654 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001655 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001656 return -1;
1657 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1658 PyUnicode_DATA(*p_unicode),
1659 (*pos)++, ch);
1660 return 0;
1661}
1662
Victor Stinnerc5166102012-02-22 13:55:02 +01001663/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1664 Return the length of the input string.
1665
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001666 WARNING: The function doesn't copy the terminating null character and
1667 doesn't check the maximum character (may write a latin1 character in an
1668 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001669static Py_ssize_t
1670unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1671{
1672 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1673 void *data = PyUnicode_DATA(unicode);
1674
1675 switch (kind) {
1676 case PyUnicode_1BYTE_KIND: {
1677 Py_ssize_t len = strlen(str);
1678 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001679 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001680 return len;
1681 }
1682 case PyUnicode_2BYTE_KIND: {
1683 Py_UCS2 *start = (Py_UCS2 *)data + index;
1684 Py_UCS2 *ucs2 = start;
1685 assert(index <= PyUnicode_GET_LENGTH(unicode));
1686
1687 for (; *str; ++ucs2, ++str)
1688 *ucs2 = (Py_UCS2)*str;
1689
1690 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1691 return ucs2 - start;
1692 }
1693 default: {
1694 Py_UCS4 *start = (Py_UCS4 *)data + index;
1695 Py_UCS4 *ucs4 = start;
1696 assert(kind == PyUnicode_4BYTE_KIND);
1697 assert(index <= PyUnicode_GET_LENGTH(unicode));
1698
1699 for (; *str; ++ucs4, ++str)
1700 *ucs4 = (Py_UCS4)*str;
1701
1702 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1703 return ucs4 - start;
1704 }
1705 }
1706}
1707
1708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709static PyObject*
1710get_latin1_char(unsigned char ch)
1711{
Victor Stinnera464fc12011-10-02 20:39:30 +02001712 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001713 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001714 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 if (!unicode)
1716 return NULL;
1717 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001718 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001719 unicode_latin1[ch] = unicode;
1720 }
1721 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001722 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001723}
1724
Alexander Belopolsky40018472011-02-26 01:02:56 +00001725PyObject *
1726PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001728 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 Py_UCS4 maxchar = 0;
1730 Py_ssize_t num_surrogates;
1731
1732 if (u == NULL)
1733 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001735 /* If the Unicode data is known at construction time, we can apply
1736 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 /* Optimization for empty strings */
1739 if (size == 0 && unicode_empty != NULL) {
1740 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001741 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001742 }
Tim Petersced69f82003-09-16 20:30:58 +00001743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 /* Single character Unicode objects in the Latin-1 range are
1745 shared when using this constructor */
1746 if (size == 1 && *u < 256)
1747 return get_latin1_char((unsigned char)*u);
1748
1749 /* If not empty and not single character, copy the Unicode data
1750 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001751 if (find_maxchar_surrogates(u, u + size,
1752 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 return NULL;
1754
Victor Stinner8faf8212011-12-08 22:14:11 +01001755 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 if (!unicode)
1757 return NULL;
1758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 switch (PyUnicode_KIND(unicode)) {
1760 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001761 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1763 break;
1764 case PyUnicode_2BYTE_KIND:
1765#if Py_UNICODE_SIZE == 2
1766 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1767#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001768 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1770#endif
1771 break;
1772 case PyUnicode_4BYTE_KIND:
1773#if SIZEOF_WCHAR_T == 2
1774 /* This is the only case which has to process surrogates, thus
1775 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001776 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777#else
1778 assert(num_surrogates == 0);
1779 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1780#endif
1781 break;
1782 default:
1783 assert(0 && "Impossible state");
1784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001786 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787}
1788
Alexander Belopolsky40018472011-02-26 01:02:56 +00001789PyObject *
1790PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001791{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 if (size < 0) {
1793 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001794 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001795 return NULL;
1796 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001797 if (u != NULL)
1798 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1799 else
1800 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001801}
1802
Alexander Belopolsky40018472011-02-26 01:02:56 +00001803PyObject *
1804PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001805{
1806 size_t size = strlen(u);
1807 if (size > PY_SSIZE_T_MAX) {
1808 PyErr_SetString(PyExc_OverflowError, "input too long");
1809 return NULL;
1810 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001811 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001812}
1813
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001814PyObject *
1815_PyUnicode_FromId(_Py_Identifier *id)
1816{
1817 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001818 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1819 strlen(id->string),
1820 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001821 if (!id->object)
1822 return NULL;
1823 PyUnicode_InternInPlace(&id->object);
1824 assert(!id->next);
1825 id->next = static_strings;
1826 static_strings = id;
1827 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001828 return id->object;
1829}
1830
1831void
1832_PyUnicode_ClearStaticStrings()
1833{
1834 _Py_Identifier *i;
1835 for (i = static_strings; i; i = i->next) {
1836 Py_DECREF(i->object);
1837 i->object = NULL;
1838 i->next = NULL;
1839 }
1840}
1841
Benjamin Peterson0df54292012-03-26 14:50:32 -04001842/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001843
Victor Stinnere57b1c02011-09-28 22:20:48 +02001844static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001845unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001846{
Victor Stinner785938e2011-12-11 20:09:03 +01001847 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001848 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001849#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001850 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001851#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001852 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001853 }
Victor Stinner785938e2011-12-11 20:09:03 +01001854 unicode = PyUnicode_New(size, 127);
1855 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001856 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001857 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1858 assert(_PyUnicode_CheckConsistency(unicode, 1));
1859 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001860}
1861
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001862static Py_UCS4
1863kind_maxchar_limit(unsigned int kind)
1864{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001865 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001866 case PyUnicode_1BYTE_KIND:
1867 return 0x80;
1868 case PyUnicode_2BYTE_KIND:
1869 return 0x100;
1870 case PyUnicode_4BYTE_KIND:
1871 return 0x10000;
1872 default:
1873 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001874 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001875 }
1876}
1877
Victor Stinnere6abb482012-05-02 01:15:40 +02001878Py_LOCAL_INLINE(Py_UCS4)
1879align_maxchar(Py_UCS4 maxchar)
1880{
1881 if (maxchar <= 127)
1882 return 127;
1883 else if (maxchar <= 255)
1884 return 255;
1885 else if (maxchar <= 65535)
1886 return 65535;
1887 else
1888 return MAX_UNICODE;
1889}
1890
Victor Stinner702c7342011-10-05 13:50:52 +02001891static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001892_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001893{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001895 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001896
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001897 if (size == 0) {
1898 Py_INCREF(unicode_empty);
1899 return unicode_empty;
1900 }
1901 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001902 if (size == 1)
1903 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001904
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001905 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001906 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 if (!res)
1908 return NULL;
1909 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001910 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001911 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001912}
1913
Victor Stinnere57b1c02011-09-28 22:20:48 +02001914static PyObject*
1915_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916{
1917 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001919
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920 if (size == 0) {
1921 Py_INCREF(unicode_empty);
1922 return unicode_empty;
1923 }
1924 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001925 if (size == 1) {
1926 Py_UCS4 ch = u[0];
1927 if (ch < 256)
1928 return get_latin1_char((unsigned char)ch);
1929
1930 res = PyUnicode_New(1, ch);
1931 if (res == NULL)
1932 return NULL;
1933 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1934 assert(_PyUnicode_CheckConsistency(res, 1));
1935 return res;
1936 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001937
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001938 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001939 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 if (!res)
1941 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001942 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001944 else {
1945 _PyUnicode_CONVERT_BYTES(
1946 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1947 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001948 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001949 return res;
1950}
1951
Victor Stinnere57b1c02011-09-28 22:20:48 +02001952static PyObject*
1953_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954{
1955 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001956 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001957
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001958 if (size == 0) {
1959 Py_INCREF(unicode_empty);
1960 return unicode_empty;
1961 }
1962 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001963 if (size == 1) {
1964 Py_UCS4 ch = u[0];
1965 if (ch < 256)
1966 return get_latin1_char((unsigned char)ch);
1967
1968 res = PyUnicode_New(1, ch);
1969 if (res == NULL)
1970 return NULL;
1971 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1972 assert(_PyUnicode_CheckConsistency(res, 1));
1973 return res;
1974 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001977 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 if (!res)
1979 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001980 if (max_char < 256)
1981 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1982 PyUnicode_1BYTE_DATA(res));
1983 else if (max_char < 0x10000)
1984 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1985 PyUnicode_2BYTE_DATA(res));
1986 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001987 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001988 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001989 return res;
1990}
1991
1992PyObject*
1993PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1994{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001995 if (size < 0) {
1996 PyErr_SetString(PyExc_ValueError, "size must be positive");
1997 return NULL;
1998 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001999 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002001 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002003 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002004 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002005 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002006 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002007 PyErr_SetString(PyExc_SystemError, "invalid kind");
2008 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010}
2011
Victor Stinnerece58de2012-04-23 23:36:38 +02002012Py_UCS4
2013_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2014{
2015 enum PyUnicode_Kind kind;
2016 void *startptr, *endptr;
2017
2018 assert(PyUnicode_IS_READY(unicode));
2019 assert(0 <= start);
2020 assert(end <= PyUnicode_GET_LENGTH(unicode));
2021 assert(start <= end);
2022
2023 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2024 return PyUnicode_MAX_CHAR_VALUE(unicode);
2025
2026 if (start == end)
2027 return 127;
2028
Victor Stinner94d558b2012-04-27 22:26:58 +02002029 if (PyUnicode_IS_ASCII(unicode))
2030 return 127;
2031
Victor Stinnerece58de2012-04-23 23:36:38 +02002032 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002033 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002034 endptr = (char *)startptr + end * kind;
2035 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002036 switch(kind) {
2037 case PyUnicode_1BYTE_KIND:
2038 return ucs1lib_find_max_char(startptr, endptr);
2039 case PyUnicode_2BYTE_KIND:
2040 return ucs2lib_find_max_char(startptr, endptr);
2041 case PyUnicode_4BYTE_KIND:
2042 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002043 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002044 assert(0);
2045 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002046 }
2047}
2048
Victor Stinner25a4b292011-10-06 12:31:55 +02002049/* Ensure that a string uses the most efficient storage, if it is not the
2050 case: create a new string with of the right kind. Write NULL into *p_unicode
2051 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002052static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002053unicode_adjust_maxchar(PyObject **p_unicode)
2054{
2055 PyObject *unicode, *copy;
2056 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002057 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002058 unsigned int kind;
2059
2060 assert(p_unicode != NULL);
2061 unicode = *p_unicode;
2062 assert(PyUnicode_IS_READY(unicode));
2063 if (PyUnicode_IS_ASCII(unicode))
2064 return;
2065
2066 len = PyUnicode_GET_LENGTH(unicode);
2067 kind = PyUnicode_KIND(unicode);
2068 if (kind == PyUnicode_1BYTE_KIND) {
2069 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002070 max_char = ucs1lib_find_max_char(u, u + len);
2071 if (max_char >= 128)
2072 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002073 }
2074 else if (kind == PyUnicode_2BYTE_KIND) {
2075 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002076 max_char = ucs2lib_find_max_char(u, u + len);
2077 if (max_char >= 256)
2078 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002079 }
2080 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002081 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002083 max_char = ucs4lib_find_max_char(u, u + len);
2084 if (max_char >= 0x10000)
2085 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002087 copy = PyUnicode_New(len, max_char);
2088 copy_characters(copy, 0, unicode, 0, len);
2089 Py_DECREF(unicode);
2090 *p_unicode = copy;
2091}
2092
Victor Stinner034f6cf2011-09-30 02:26:44 +02002093PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002094_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002095{
Victor Stinner87af4f22011-11-21 23:03:47 +01002096 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002097 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002098
Victor Stinner034f6cf2011-09-30 02:26:44 +02002099 if (!PyUnicode_Check(unicode)) {
2100 PyErr_BadInternalCall();
2101 return NULL;
2102 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002103 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002104 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002105
Victor Stinner87af4f22011-11-21 23:03:47 +01002106 length = PyUnicode_GET_LENGTH(unicode);
2107 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002108 if (!copy)
2109 return NULL;
2110 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2111
Victor Stinner87af4f22011-11-21 23:03:47 +01002112 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2113 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002114 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002115 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002116}
2117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118
Victor Stinnerbc603d12011-10-02 01:00:40 +02002119/* Widen Unicode objects to larger buffers. Don't write terminating null
2120 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121
2122void*
2123_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2124{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002125 Py_ssize_t len;
2126 void *result;
2127 unsigned int skind;
2128
Benjamin Petersonbac79492012-01-14 13:34:47 -05002129 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002130 return NULL;
2131
2132 len = PyUnicode_GET_LENGTH(s);
2133 skind = PyUnicode_KIND(s);
2134 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002135 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002136 return NULL;
2137 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002138 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002139 case PyUnicode_2BYTE_KIND:
2140 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2141 if (!result)
2142 return PyErr_NoMemory();
2143 assert(skind == PyUnicode_1BYTE_KIND);
2144 _PyUnicode_CONVERT_BYTES(
2145 Py_UCS1, Py_UCS2,
2146 PyUnicode_1BYTE_DATA(s),
2147 PyUnicode_1BYTE_DATA(s) + len,
2148 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002150 case PyUnicode_4BYTE_KIND:
2151 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2152 if (!result)
2153 return PyErr_NoMemory();
2154 if (skind == PyUnicode_2BYTE_KIND) {
2155 _PyUnicode_CONVERT_BYTES(
2156 Py_UCS2, Py_UCS4,
2157 PyUnicode_2BYTE_DATA(s),
2158 PyUnicode_2BYTE_DATA(s) + len,
2159 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002161 else {
2162 assert(skind == PyUnicode_1BYTE_KIND);
2163 _PyUnicode_CONVERT_BYTES(
2164 Py_UCS1, Py_UCS4,
2165 PyUnicode_1BYTE_DATA(s),
2166 PyUnicode_1BYTE_DATA(s) + len,
2167 result);
2168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170 default:
2171 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 }
Victor Stinner01698042011-10-04 00:04:26 +02002173 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 return NULL;
2175}
2176
2177static Py_UCS4*
2178as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2179 int copy_null)
2180{
2181 int kind;
2182 void *data;
2183 Py_ssize_t len, targetlen;
2184 if (PyUnicode_READY(string) == -1)
2185 return NULL;
2186 kind = PyUnicode_KIND(string);
2187 data = PyUnicode_DATA(string);
2188 len = PyUnicode_GET_LENGTH(string);
2189 targetlen = len;
2190 if (copy_null)
2191 targetlen++;
2192 if (!target) {
2193 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2194 PyErr_NoMemory();
2195 return NULL;
2196 }
2197 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2198 if (!target) {
2199 PyErr_NoMemory();
2200 return NULL;
2201 }
2202 }
2203 else {
2204 if (targetsize < targetlen) {
2205 PyErr_Format(PyExc_SystemError,
2206 "string is longer than the buffer");
2207 if (copy_null && 0 < targetsize)
2208 target[0] = 0;
2209 return NULL;
2210 }
2211 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002212 if (kind == PyUnicode_1BYTE_KIND) {
2213 Py_UCS1 *start = (Py_UCS1 *) data;
2214 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002216 else if (kind == PyUnicode_2BYTE_KIND) {
2217 Py_UCS2 *start = (Py_UCS2 *) data;
2218 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2219 }
2220 else {
2221 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 if (copy_null)
2225 target[len] = 0;
2226 return target;
2227}
2228
2229Py_UCS4*
2230PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2231 int copy_null)
2232{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002233 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 PyErr_BadInternalCall();
2235 return NULL;
2236 }
2237 return as_ucs4(string, target, targetsize, copy_null);
2238}
2239
2240Py_UCS4*
2241PyUnicode_AsUCS4Copy(PyObject *string)
2242{
2243 return as_ucs4(string, NULL, 0, 1);
2244}
2245
2246#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002247
Alexander Belopolsky40018472011-02-26 01:02:56 +00002248PyObject *
2249PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002252 if (size == 0) {
2253 Py_INCREF(unicode_empty);
2254 return unicode_empty;
2255 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002256 PyErr_BadInternalCall();
2257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258 }
2259
Martin v. Löwis790465f2008-04-05 20:41:37 +00002260 if (size == -1) {
2261 size = wcslen(w);
2262 }
2263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265}
2266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002268
Walter Dörwald346737f2007-05-31 10:44:43 +00002269static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002270makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2271 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002272{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002273 *fmt++ = '%';
2274 if (width) {
2275 if (zeropad)
2276 *fmt++ = '0';
2277 fmt += sprintf(fmt, "%d", width);
2278 }
2279 if (precision)
2280 fmt += sprintf(fmt, ".%d", precision);
2281 if (longflag)
2282 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002283 else if (longlongflag) {
2284 /* longlongflag should only ever be nonzero on machines with
2285 HAVE_LONG_LONG defined */
2286#ifdef HAVE_LONG_LONG
2287 char *f = PY_FORMAT_LONG_LONG;
2288 while (*f)
2289 *fmt++ = *f++;
2290#else
2291 /* we shouldn't ever get here */
2292 assert(0);
2293 *fmt++ = 'l';
2294#endif
2295 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002296 else if (size_tflag) {
2297 char *f = PY_FORMAT_SIZE_T;
2298 while (*f)
2299 *fmt++ = *f++;
2300 }
2301 *fmt++ = c;
2302 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002303}
2304
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification located between the '%' and the conversion character.

   `f` must point to the '%'.  The parsed width, precision and length
   modifier flags ("l", "ll", "z") are stored through the p_* pointers;
   each of them may be NULL if the caller is not interested.  A missing
   width or precision is reported as 0.  A length modifier is only
   recognized when it is directly followed by 'd', 'u' or 'i'.

   Return a pointer to the conversion character. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;  /* skip the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2369
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002370/* maximum number of characters required for output of %ld. 21 characters
2371 allows for 64-bit integers (in decimal) and an optional sign. */
2372#define MAX_LONG_CHARS 21
2373/* maximum number of characters required for output of %lld.
2374 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2375 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2376#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2377
Walter Dörwaldd2034312007-05-18 16:29:38 +00002378PyObject *
2379PyUnicode_FromFormatV(const char *format, va_list vargs)
2380{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002381 va_list count;
2382 Py_ssize_t callcount = 0;
2383 PyObject **callresults = NULL;
2384 PyObject **callresult = NULL;
2385 Py_ssize_t n = 0;
2386 int width = 0;
2387 int precision = 0;
2388 int zeropad;
2389 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002390 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002391 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002392 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2394 Py_UCS4 argmaxchar;
2395 Py_ssize_t numbersize = 0;
2396 char *numberresults = NULL;
2397 char *numberresult = NULL;
2398 Py_ssize_t i;
2399 int kind;
2400 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002401
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002402 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002403 /* step 1: count the number of %S/%R/%A/%s format specifications
2404 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2405 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002407 * also estimate a upper bound for all the number formats in the string,
2408 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002409 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002410 for (f = format; *f; f++) {
2411 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002412 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2414 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2415 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2416 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002418 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002419#ifdef HAVE_LONG_LONG
2420 if (longlongflag) {
2421 if (width < MAX_LONG_LONG_CHARS)
2422 width = MAX_LONG_LONG_CHARS;
2423 }
2424 else
2425#endif
2426 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2427 including sign. Decimal takes the most space. This
2428 isn't enough for octal. If a width is specified we
2429 need more (which we allocate later). */
2430 if (width < MAX_LONG_CHARS)
2431 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432
2433 /* account for the size + '\0' to separate numbers
2434 inside of the numberresults buffer */
2435 numbersize += (width + 1);
2436 }
2437 }
2438 else if ((unsigned char)*f > 127) {
2439 PyErr_Format(PyExc_ValueError,
2440 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2441 "string, got a non-ASCII byte: 0x%02x",
2442 (unsigned char)*f);
2443 return NULL;
2444 }
2445 }
2446 /* step 2: allocate memory for the results of
2447 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2448 if (callcount) {
2449 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2450 if (!callresults) {
2451 PyErr_NoMemory();
2452 return NULL;
2453 }
2454 callresult = callresults;
2455 }
2456 /* step 2.5: allocate memory for the results of formating numbers */
2457 if (numbersize) {
2458 numberresults = PyObject_Malloc(numbersize);
2459 if (!numberresults) {
2460 PyErr_NoMemory();
2461 goto fail;
2462 }
2463 numberresult = numberresults;
2464 }
2465
2466 /* step 3: format numbers and figure out how large a buffer we need */
2467 for (f = format; *f; f++) {
2468 if (*f == '%') {
2469 const char* p;
2470 int longflag;
2471 int longlongflag;
2472 int size_tflag;
2473 int numprinted;
2474
2475 p = f;
2476 zeropad = (f[1] == '0');
2477 f = parse_format_flags(f, &width, &precision,
2478 &longflag, &longlongflag, &size_tflag);
2479 switch (*f) {
2480 case 'c':
2481 {
2482 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002483 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 n++;
2485 break;
2486 }
2487 case '%':
2488 n++;
2489 break;
2490 case 'i':
2491 case 'd':
2492 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2493 width, precision, *f);
2494 if (longflag)
2495 numprinted = sprintf(numberresult, fmt,
2496 va_arg(count, long));
2497#ifdef HAVE_LONG_LONG
2498 else if (longlongflag)
2499 numprinted = sprintf(numberresult, fmt,
2500 va_arg(count, PY_LONG_LONG));
2501#endif
2502 else if (size_tflag)
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, Py_ssize_t));
2505 else
2506 numprinted = sprintf(numberresult, fmt,
2507 va_arg(count, int));
2508 n += numprinted;
2509 /* advance by +1 to skip over the '\0' */
2510 numberresult += (numprinted + 1);
2511 assert(*(numberresult - 1) == '\0');
2512 assert(*(numberresult - 2) != '\0');
2513 assert(numprinted >= 0);
2514 assert(numberresult <= numberresults + numbersize);
2515 break;
2516 case 'u':
2517 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2518 width, precision, 'u');
2519 if (longflag)
2520 numprinted = sprintf(numberresult, fmt,
2521 va_arg(count, unsigned long));
2522#ifdef HAVE_LONG_LONG
2523 else if (longlongflag)
2524 numprinted = sprintf(numberresult, fmt,
2525 va_arg(count, unsigned PY_LONG_LONG));
2526#endif
2527 else if (size_tflag)
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, size_t));
2530 else
2531 numprinted = sprintf(numberresult, fmt,
2532 va_arg(count, unsigned int));
2533 n += numprinted;
2534 numberresult += (numprinted + 1);
2535 assert(*(numberresult - 1) == '\0');
2536 assert(*(numberresult - 2) != '\0');
2537 assert(numprinted >= 0);
2538 assert(numberresult <= numberresults + numbersize);
2539 break;
2540 case 'x':
2541 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2542 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2543 n += numprinted;
2544 numberresult += (numprinted + 1);
2545 assert(*(numberresult - 1) == '\0');
2546 assert(*(numberresult - 2) != '\0');
2547 assert(numprinted >= 0);
2548 assert(numberresult <= numberresults + numbersize);
2549 break;
2550 case 'p':
2551 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2552 /* %p is ill-defined: ensure leading 0x. */
2553 if (numberresult[1] == 'X')
2554 numberresult[1] = 'x';
2555 else if (numberresult[1] != 'x') {
2556 memmove(numberresult + 2, numberresult,
2557 strlen(numberresult) + 1);
2558 numberresult[0] = '0';
2559 numberresult[1] = 'x';
2560 numprinted += 2;
2561 }
2562 n += numprinted;
2563 numberresult += (numprinted + 1);
2564 assert(*(numberresult - 1) == '\0');
2565 assert(*(numberresult - 2) != '\0');
2566 assert(numprinted >= 0);
2567 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 break;
2569 case 's':
2570 {
2571 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002572 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002573 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002574 if (!str)
2575 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 /* since PyUnicode_DecodeUTF8 returns already flexible
2577 unicode objects, there is no need to call ready on them */
2578 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002579 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002581 /* Remember the str and switch to the next slot */
2582 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 }
2585 case 'U':
2586 {
2587 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002588 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 if (PyUnicode_READY(obj) == -1)
2590 goto fail;
2591 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002592 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 break;
2595 }
2596 case 'V':
2597 {
2598 PyObject *obj = va_arg(count, PyObject *);
2599 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002602 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002603 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 if (PyUnicode_READY(obj) == -1)
2605 goto fail;
2606 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002607 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002609 *callresult++ = NULL;
2610 }
2611 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002612 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002613 if (!str_obj)
2614 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002615 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002616 Py_DECREF(str_obj);
2617 goto fail;
2618 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002620 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002622 *callresult++ = str_obj;
2623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002624 break;
2625 }
2626 case 'S':
2627 {
2628 PyObject *obj = va_arg(count, PyObject *);
2629 PyObject *str;
2630 assert(obj);
2631 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002632 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002634 if (PyUnicode_READY(str) == -1) {
2635 Py_DECREF(str);
2636 goto fail;
2637 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002639 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 /* Remember the str and switch to the next slot */
2642 *callresult++ = str;
2643 break;
2644 }
2645 case 'R':
2646 {
2647 PyObject *obj = va_arg(count, PyObject *);
2648 PyObject *repr;
2649 assert(obj);
2650 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002651 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002653 if (PyUnicode_READY(repr) == -1) {
2654 Py_DECREF(repr);
2655 goto fail;
2656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002658 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002660 /* Remember the repr and switch to the next slot */
2661 *callresult++ = repr;
2662 break;
2663 }
2664 case 'A':
2665 {
2666 PyObject *obj = va_arg(count, PyObject *);
2667 PyObject *ascii;
2668 assert(obj);
2669 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002670 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002672 if (PyUnicode_READY(ascii) == -1) {
2673 Py_DECREF(ascii);
2674 goto fail;
2675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002676 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002677 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002679 /* Remember the repr and switch to the next slot */
2680 *callresult++ = ascii;
2681 break;
2682 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002683 default:
2684 /* if we stumble upon an unknown
2685 formatting code, copy the rest of
2686 the format string to the output
2687 string. (we cannot just skip the
2688 code, since there's no way to know
2689 what's in the argument list) */
2690 n += strlen(p);
2691 goto expand;
2692 }
2693 } else
2694 n++;
2695 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002696 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002697 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002698 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 we don't have to resize the string.
2700 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002701 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 if (!string)
2703 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 kind = PyUnicode_KIND(string);
2705 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002711 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002712
2713 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002714 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2715 /* checking for == because the last argument could be a empty
2716 string, which causes i to point to end, the assert at the end of
2717 the loop */
2718 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719
Benjamin Peterson14339b62009-01-31 16:36:08 +00002720 switch (*f) {
2721 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002722 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723 const int ordinal = va_arg(vargs, int);
2724 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002726 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002727 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002728 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002729 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002730 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002732 {
2733 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 /* unused, since we already have the result */
2735 if (*f == 'p')
2736 (void) va_arg(vargs, void *);
2737 else
2738 (void) va_arg(vargs, int);
2739 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002740 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002742 i += written;
2743 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002744 assert(*numberresult == '\0');
2745 numberresult++;
2746 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002748 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 case 's':
2750 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002751 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002753 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002754 size = PyUnicode_GET_LENGTH(*callresult);
2755 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002756 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002758 /* We're done with the unicode()/repr() => forget it */
2759 Py_DECREF(*callresult);
2760 /* switch to next unicode()/repr() result */
2761 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 break;
2763 }
2764 case 'U':
2765 {
2766 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 Py_ssize_t size;
2768 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2769 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002770 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002771 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002772 break;
2773 }
2774 case 'V':
2775 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002778 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002779 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 size = PyUnicode_GET_LENGTH(obj);
2781 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002782 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002785 size = PyUnicode_GET_LENGTH(*callresult);
2786 assert(PyUnicode_KIND(*callresult) <=
2787 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002788 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002789 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002792 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 break;
2794 }
2795 case 'S':
2796 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002797 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002799 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002800 /* unused, since we already have the result */
2801 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002802 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002803 copy_characters(string, i, *callresult, 0, size);
2804 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002805 /* We're done with the unicode()/repr() => forget it */
2806 Py_DECREF(*callresult);
2807 /* switch to next unicode()/repr() result */
2808 ++callresult;
2809 break;
2810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002812 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 break;
2814 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002815 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 goto end;
2818 }
Victor Stinner1205f272010-09-11 00:54:47 +00002819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 else {
2821 assert(i < PyUnicode_GET_LENGTH(string));
2822 PyUnicode_WRITE(kind, data, i++, *f);
2823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002826
Benjamin Peterson29060642009-01-31 22:14:21 +00002827 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002828 if (callresults)
2829 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002830 if (numberresults)
2831 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002832 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002833 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002834 if (callresults) {
2835 PyObject **callresult2 = callresults;
2836 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002837 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002838 ++callresult2;
2839 }
2840 PyObject_Free(callresults);
2841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002842 if (numberresults)
2843 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002845}
2846
Walter Dörwaldd2034312007-05-18 16:29:38 +00002847PyObject *
2848PyUnicode_FromFormat(const char *format, ...)
2849{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002850 PyObject* ret;
2851 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002852
2853#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002855#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002856 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002857#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 ret = PyUnicode_FromFormatV(format, vargs);
2859 va_end(vargs);
2860 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861}
2862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002863#ifdef HAVE_WCHAR_H
2864
Victor Stinner5593d8a2010-10-02 11:11:27 +00002865/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2866 convert a Unicode object to a wide character string.
2867
Victor Stinnerd88d9832011-09-06 02:00:05 +02002868 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869 character) required to convert the unicode object. Ignore size argument.
2870
Victor Stinnerd88d9832011-09-06 02:00:05 +02002871 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002872 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002873 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002874static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002875unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002876 wchar_t *w,
2877 Py_ssize_t size)
2878{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002879 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 const wchar_t *wstr;
2881
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002882 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002883 if (wstr == NULL)
2884 return -1;
2885
Victor Stinner5593d8a2010-10-02 11:11:27 +00002886 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002887 if (size > res)
2888 size = res + 1;
2889 else
2890 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002891 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002892 return res;
2893 }
2894 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002895 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002896}
2897
2898Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002899PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002900 wchar_t *w,
2901 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902{
2903 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002904 PyErr_BadInternalCall();
2905 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002907 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908}
2909
Victor Stinner137c34c2010-09-29 10:25:54 +00002910wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002911PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002912 Py_ssize_t *size)
2913{
2914 wchar_t* buffer;
2915 Py_ssize_t buflen;
2916
2917 if (unicode == NULL) {
2918 PyErr_BadInternalCall();
2919 return NULL;
2920 }
2921
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002922 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002923 if (buflen == -1)
2924 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002925 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002926 PyErr_NoMemory();
2927 return NULL;
2928 }
2929
Victor Stinner137c34c2010-09-29 10:25:54 +00002930 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2931 if (buffer == NULL) {
2932 PyErr_NoMemory();
2933 return NULL;
2934 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002935 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002936 if (buflen == -1)
2937 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002938 if (size != NULL)
2939 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002940 return buffer;
2941}
2942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944
Alexander Belopolsky40018472011-02-26 01:02:56 +00002945PyObject *
2946PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002948 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002949 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002950 PyErr_SetString(PyExc_ValueError,
2951 "chr() arg not in range(0x110000)");
2952 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002953 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 if (ordinal < 256)
2956 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 v = PyUnicode_New(1, ordinal);
2959 if (v == NULL)
2960 return NULL;
2961 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002962 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002963 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002964}
2965
Alexander Belopolsky40018472011-02-26 01:02:56 +00002966PyObject *
2967PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002968{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002969 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002970 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002971 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002972 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002973 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 Py_INCREF(obj);
2975 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002976 }
2977 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 /* For a Unicode subtype that's not a Unicode object,
2979 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002980 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002981 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002982 PyErr_Format(PyExc_TypeError,
2983 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002984 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002985 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002990 const char *encoding,
2991 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002992{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002993 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002994 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002995
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 PyErr_BadInternalCall();
2998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003000
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003001 /* Decoding bytes objects is the most common case and should be fast */
3002 if (PyBytes_Check(obj)) {
3003 if (PyBytes_GET_SIZE(obj) == 0) {
3004 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003005 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003006 }
3007 else {
3008 v = PyUnicode_Decode(
3009 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3010 encoding, errors);
3011 }
3012 return v;
3013 }
3014
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003015 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 PyErr_SetString(PyExc_TypeError,
3017 "decoding str is not supported");
3018 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003019 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003020
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003021 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3022 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3023 PyErr_Format(PyExc_TypeError,
3024 "coercing to str: need bytes, bytearray "
3025 "or buffer-like object, %.80s found",
3026 Py_TYPE(obj)->tp_name);
3027 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003028 }
Tim Petersced69f82003-09-16 20:30:58 +00003029
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003030 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003032 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003033 }
Tim Petersced69f82003-09-16 20:30:58 +00003034 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003035 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003036
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003037 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003038 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039}
3040
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, storing the null-terminated result in lower.

   encoding   encoding name to normalize; NULL is treated as "utf-8"
   lower      output buffer
   lower_len  size of the output buffer in bytes, terminator included

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof("utf-8") counts the terminating null byte: refuse to
           overflow a buffer that is too small for the default name. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Without room for even the terminator, lower_len - 1 below would
       wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot of the buffer, reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3077
Alexander Belopolsky40018472011-02-26 01:02:56 +00003078PyObject *
3079PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003080 Py_ssize_t size,
3081 const char *encoding,
3082 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003083{
3084 PyObject *buffer = NULL, *unicode;
3085 Py_buffer info;
3086 char lower[11]; /* Enough for any encoding shortcut */
3087
Fred Drakee4315f52000-05-09 19:53:39 +00003088 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003089 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003090 if ((strcmp(lower, "utf-8") == 0) ||
3091 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003092 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003093 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003094 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003095 (strcmp(lower, "iso-8859-1") == 0))
3096 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003097#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003098 else if (strcmp(lower, "mbcs") == 0)
3099 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003100#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003101 else if (strcmp(lower, "ascii") == 0)
3102 return PyUnicode_DecodeASCII(s, size, errors);
3103 else if (strcmp(lower, "utf-16") == 0)
3104 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3105 else if (strcmp(lower, "utf-32") == 0)
3106 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3107 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108
3109 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003110 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003111 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003112 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003113 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003114 if (buffer == NULL)
3115 goto onError;
3116 unicode = PyCodec_Decode(buffer, encoding, errors);
3117 if (unicode == NULL)
3118 goto onError;
3119 if (!PyUnicode_Check(unicode)) {
3120 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003121 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003122 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003123 Py_DECREF(unicode);
3124 goto onError;
3125 }
3126 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003127 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003128
Benjamin Peterson29060642009-01-31 22:14:21 +00003129 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130 Py_XDECREF(buffer);
3131 return NULL;
3132}
3133
Alexander Belopolsky40018472011-02-26 01:02:56 +00003134PyObject *
3135PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003136 const char *encoding,
3137 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003138{
3139 PyObject *v;
3140
3141 if (!PyUnicode_Check(unicode)) {
3142 PyErr_BadArgument();
3143 goto onError;
3144 }
3145
3146 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003147 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003148
3149 /* Decode via the codec registry */
3150 v = PyCodec_Decode(unicode, encoding, errors);
3151 if (v == NULL)
3152 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003153 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003154
Benjamin Peterson29060642009-01-31 22:14:21 +00003155 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003156 return NULL;
3157}
3158
Alexander Belopolsky40018472011-02-26 01:02:56 +00003159PyObject *
3160PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003161 const char *encoding,
3162 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003163{
3164 PyObject *v;
3165
3166 if (!PyUnicode_Check(unicode)) {
3167 PyErr_BadArgument();
3168 goto onError;
3169 }
3170
3171 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003173
3174 /* Decode via the codec registry */
3175 v = PyCodec_Decode(unicode, encoding, errors);
3176 if (v == NULL)
3177 goto onError;
3178 if (!PyUnicode_Check(v)) {
3179 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003180 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181 Py_TYPE(v)->tp_name);
3182 Py_DECREF(v);
3183 goto onError;
3184 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003185 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003188 return NULL;
3189}
3190
Alexander Belopolsky40018472011-02-26 01:02:56 +00003191PyObject *
3192PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003193 Py_ssize_t size,
3194 const char *encoding,
3195 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196{
3197 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003198
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 unicode = PyUnicode_FromUnicode(s, size);
3200 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003201 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3203 Py_DECREF(unicode);
3204 return v;
3205}
3206
Alexander Belopolsky40018472011-02-26 01:02:56 +00003207PyObject *
3208PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003209 const char *encoding,
3210 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003211{
3212 PyObject *v;
3213
3214 if (!PyUnicode_Check(unicode)) {
3215 PyErr_BadArgument();
3216 goto onError;
3217 }
3218
3219 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003220 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003221
3222 /* Encode via the codec registry */
3223 v = PyCodec_Encode(unicode, encoding, errors);
3224 if (v == NULL)
3225 goto onError;
3226 return v;
3227
Benjamin Peterson29060642009-01-31 22:14:21 +00003228 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003229 return NULL;
3230}
3231
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 2 bytes wide).  Returns the index, in
   wchar_t units, of the first piece that fails to convert, or 0 if every
   piece converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or surrogate pair) plus the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t piece[3] = {0, 0, 0};
#else
    wchar_t piece[2] = {0, 0};
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *const start = wstr;

    while (*wstr != L'\0') {
        const wchar_t *current = wstr;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[0])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[1]))
        {
            piece[0] = wstr[0];
            piece[1] = wstr[1];
            wstr += 2;
        }
        else {
            piece[0] = wstr[0];
            piece[1] = 0;
            wstr += 1;
        }
#else
        piece[0] = *wstr;
        wstr += 1;
#endif
        if (wcstombs(encoded, piece, sizeof(encoded)) == (size_t)-1)
            return current - start;
    }

    /* failed to find the unencodable character */
    return 0;
}
3278
Victor Stinner1b579672011-12-17 05:47:23 +01003279static int
3280locale_error_handler(const char *errors, int *surrogateescape)
3281{
3282 if (errors == NULL) {
3283 *surrogateescape = 0;
3284 return 0;
3285 }
3286
3287 if (strcmp(errors, "strict") == 0) {
3288 *surrogateescape = 0;
3289 return 0;
3290 }
3291 if (strcmp(errors, "surrogateescape") == 0) {
3292 *surrogateescape = 1;
3293 return 0;
3294 }
3295 PyErr_Format(PyExc_ValueError,
3296 "only 'strict' and 'surrogateescape' error handlers "
3297 "are supported, not '%s'",
3298 errors);
3299 return -1;
3300}
3301
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003302PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003303PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003304{
3305 Py_ssize_t wlen, wlen2;
3306 wchar_t *wstr;
3307 PyObject *bytes = NULL;
3308 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003309 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003310 PyObject *exc;
3311 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003312 int surrogateescape;
3313
3314 if (locale_error_handler(errors, &surrogateescape) < 0)
3315 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003316
3317 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3318 if (wstr == NULL)
3319 return NULL;
3320
3321 wlen2 = wcslen(wstr);
3322 if (wlen2 != wlen) {
3323 PyMem_Free(wstr);
3324 PyErr_SetString(PyExc_TypeError, "embedded null character");
3325 return NULL;
3326 }
3327
3328 if (surrogateescape) {
3329 /* locale encoding with surrogateescape */
3330 char *str;
3331
3332 str = _Py_wchar2char(wstr, &error_pos);
3333 if (str == NULL) {
3334 if (error_pos == (size_t)-1) {
3335 PyErr_NoMemory();
3336 PyMem_Free(wstr);
3337 return NULL;
3338 }
3339 else {
3340 goto encode_error;
3341 }
3342 }
3343 PyMem_Free(wstr);
3344
3345 bytes = PyBytes_FromString(str);
3346 PyMem_Free(str);
3347 }
3348 else {
3349 size_t len, len2;
3350
3351 len = wcstombs(NULL, wstr, 0);
3352 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003353 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003354 goto encode_error;
3355 }
3356
3357 bytes = PyBytes_FromStringAndSize(NULL, len);
3358 if (bytes == NULL) {
3359 PyMem_Free(wstr);
3360 return NULL;
3361 }
3362
3363 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3364 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003365 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003366 goto encode_error;
3367 }
3368 PyMem_Free(wstr);
3369 }
3370 return bytes;
3371
3372encode_error:
3373 errmsg = strerror(errno);
3374 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003375
3376 if (error_pos == (size_t)-1)
3377 error_pos = wcstombs_errorpos(wstr);
3378
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003379 PyMem_Free(wstr);
3380 Py_XDECREF(bytes);
3381
Victor Stinner2f197072011-12-17 07:08:30 +01003382 if (errmsg != NULL) {
3383 size_t errlen;
3384 wstr = _Py_char2wchar(errmsg, &errlen);
3385 if (wstr != NULL) {
3386 reason = PyUnicode_FromWideChar(wstr, errlen);
3387 PyMem_Free(wstr);
3388 } else
3389 errmsg = NULL;
3390 }
3391 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003392 reason = PyUnicode_FromString(
3393 "wcstombs() encountered an unencodable "
3394 "wide character");
3395 if (reason == NULL)
3396 return NULL;
3397
3398 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3399 "locale", unicode,
3400 (Py_ssize_t)error_pos,
3401 (Py_ssize_t)(error_pos+1),
3402 reason);
3403 Py_DECREF(reason);
3404 if (exc != NULL) {
3405 PyCodec_StrictErrors(exc);
3406 Py_XDECREF(exc);
3407 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003408 return NULL;
3409}
3410
Victor Stinnerad158722010-10-27 00:25:46 +00003411PyObject *
3412PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003413{
Victor Stinner99b95382011-07-04 14:23:54 +02003414#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003415 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003416#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003417 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003418#else
Victor Stinner793b5312011-04-27 00:24:21 +02003419 PyInterpreterState *interp = PyThreadState_GET()->interp;
3420 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3421 cannot use it to encode and decode filenames before it is loaded. Load
3422 the Python codec requires to encode at least its own filename. Use the C
3423 version of the locale codec until the codec registry is initialized and
3424 the Python codec is loaded.
3425
3426 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3427 cannot only rely on it: check also interp->fscodec_initialized for
3428 subinterpreters. */
3429 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003430 return PyUnicode_AsEncodedString(unicode,
3431 Py_FileSystemDefaultEncoding,
3432 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003433 }
3434 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003435 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003436 }
Victor Stinnerad158722010-10-27 00:25:46 +00003437#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003438}
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440PyObject *
3441PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding,
3443 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444{
3445 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003446 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 if (!PyUnicode_Check(unicode)) {
3449 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 }
Fred Drakee4315f52000-05-09 19:53:39 +00003452
Fred Drakee4315f52000-05-09 19:53:39 +00003453 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003454 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003455 if ((strcmp(lower, "utf-8") == 0) ||
3456 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003457 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003458 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003459 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003460 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003461 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003462 }
Victor Stinner37296e82010-06-10 13:36:23 +00003463 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003464 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003465 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003466 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003467#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003468 else if (strcmp(lower, "mbcs") == 0)
3469 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003470#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003471 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003474
3475 /* Encode via the codec registry */
3476 v = PyCodec_Encode(unicode, encoding, errors);
3477 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003478 return NULL;
3479
3480 /* The normal path */
3481 if (PyBytes_Check(v))
3482 return v;
3483
3484 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003486 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003487 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003488
3489 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3490 "encoder %s returned bytearray instead of bytes",
3491 encoding);
3492 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003493 Py_DECREF(v);
3494 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003495 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003496
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003497 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3498 Py_DECREF(v);
3499 return b;
3500 }
3501
3502 PyErr_Format(PyExc_TypeError,
3503 "encoder did not return a bytes object (type=%.400s)",
3504 Py_TYPE(v)->tp_name);
3505 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506 return NULL;
3507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 const char *encoding,
3512 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513{
3514 PyObject *v;
3515
3516 if (!PyUnicode_Check(unicode)) {
3517 PyErr_BadArgument();
3518 goto onError;
3519 }
3520
3521 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003522 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003523
3524 /* Encode via the codec registry */
3525 v = PyCodec_Encode(unicode, encoding, errors);
3526 if (v == NULL)
3527 goto onError;
3528 if (!PyUnicode_Check(v)) {
3529 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003530 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003531 Py_TYPE(v)->tp_name);
3532 Py_DECREF(v);
3533 goto onError;
3534 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003536
Benjamin Peterson29060642009-01-31 22:14:21 +00003537 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 return NULL;
3539}
3540
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   convert.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 if none is found, or if mbrtowc() is not available
   (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (len != 0) {
        size_t consumed = mbrtowc(&wc, cursor, len, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        len -= consumed;
    }
    /* failed to find the undecodable byte sequence */
#endif
    return 0;
}
3571
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003573PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003574 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003575{
3576 wchar_t smallbuf[256];
3577 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3578 wchar_t *wstr;
3579 size_t wlen, wlen2;
3580 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003581 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003582 size_t error_pos;
3583 char *errmsg;
3584 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003585
3586 if (locale_error_handler(errors, &surrogateescape) < 0)
3587 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003588
3589 if (str[len] != '\0' || len != strlen(str)) {
3590 PyErr_SetString(PyExc_TypeError, "embedded null character");
3591 return NULL;
3592 }
3593
3594 if (surrogateescape)
3595 {
3596 wstr = _Py_char2wchar(str, &wlen);
3597 if (wstr == NULL) {
3598 if (wlen == (size_t)-1)
3599 PyErr_NoMemory();
3600 else
3601 PyErr_SetFromErrno(PyExc_OSError);
3602 return NULL;
3603 }
3604
3605 unicode = PyUnicode_FromWideChar(wstr, wlen);
3606 PyMem_Free(wstr);
3607 }
3608 else {
3609#ifndef HAVE_BROKEN_MBSTOWCS
3610 wlen = mbstowcs(NULL, str, 0);
3611#else
3612 wlen = len;
3613#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003614 if (wlen == (size_t)-1)
3615 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003616 if (wlen+1 <= smallbuf_len) {
3617 wstr = smallbuf;
3618 }
3619 else {
3620 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3621 return PyErr_NoMemory();
3622
3623 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3624 if (!wstr)
3625 return PyErr_NoMemory();
3626 }
3627
3628 /* This shouldn't fail now */
3629 wlen2 = mbstowcs(wstr, str, wlen+1);
3630 if (wlen2 == (size_t)-1) {
3631 if (wstr != smallbuf)
3632 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003633 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003634 }
3635#ifdef HAVE_BROKEN_MBSTOWCS
3636 assert(wlen2 == wlen);
3637#endif
3638 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3639 if (wstr != smallbuf)
3640 PyMem_Free(wstr);
3641 }
3642 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003643
3644decode_error:
3645 errmsg = strerror(errno);
3646 assert(errmsg != NULL);
3647
3648 error_pos = mbstowcs_errorpos(str, len);
3649 if (errmsg != NULL) {
3650 size_t errlen;
3651 wstr = _Py_char2wchar(errmsg, &errlen);
3652 if (wstr != NULL) {
3653 reason = PyUnicode_FromWideChar(wstr, errlen);
3654 PyMem_Free(wstr);
3655 } else
3656 errmsg = NULL;
3657 }
3658 if (errmsg == NULL)
3659 reason = PyUnicode_FromString(
3660 "mbstowcs() encountered an invalid multibyte sequence");
3661 if (reason == NULL)
3662 return NULL;
3663
3664 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3665 "locale", str, len,
3666 (Py_ssize_t)error_pos,
3667 (Py_ssize_t)(error_pos+1),
3668 reason);
3669 Py_DECREF(reason);
3670 if (exc != NULL) {
3671 PyCodec_StrictErrors(exc);
3672 Py_XDECREF(exc);
3673 }
3674 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003675}
3676
3677PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003678PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679{
3680 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003681 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003682}
3683
3684
3685PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003686PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003688 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3689}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003690
Christian Heimes5894ba72007-11-04 11:43:14 +00003691PyObject*
3692PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3693{
Victor Stinner99b95382011-07-04 14:23:54 +02003694#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003695 return PyUnicode_DecodeMBCS(s, size, NULL);
3696#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003697 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003698#else
Victor Stinner793b5312011-04-27 00:24:21 +02003699 PyInterpreterState *interp = PyThreadState_GET()->interp;
3700 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3701 cannot use it to encode and decode filenames before it is loaded. Load
3702 the Python codec requires to encode at least its own filename. Use the C
3703 version of the locale codec until the codec registry is initialized and
3704 the Python codec is loaded.
3705
3706 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3707 cannot only rely on it: check also interp->fscodec_initialized for
3708 subinterpreters. */
3709 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003710 return PyUnicode_Decode(s, size,
3711 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003712 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003713 }
3714 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003715 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003716 }
Victor Stinnerad158722010-10-27 00:25:46 +00003717#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003718}
3719
Martin v. Löwis011e8422009-05-05 04:43:17 +00003720
3721int
Antoine Pitrou13348842012-01-29 18:36:34 +01003722_PyUnicode_HasNULChars(PyObject* s)
3723{
3724 static PyObject *nul = NULL;
3725
3726 if (nul == NULL)
3727 nul = PyUnicode_FromStringAndSize("\0", 1);
3728 if (nul == NULL)
3729 return -1;
3730 return PyUnicode_Contains(s, nul);
3731}
3732
3733
3734int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003735PyUnicode_FSConverter(PyObject* arg, void* addr)
3736{
3737 PyObject *output = NULL;
3738 Py_ssize_t size;
3739 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003740 if (arg == NULL) {
3741 Py_DECREF(*(PyObject**)addr);
3742 return 1;
3743 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003744 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003745 output = arg;
3746 Py_INCREF(output);
3747 }
3748 else {
3749 arg = PyUnicode_FromObject(arg);
3750 if (!arg)
3751 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003752 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003753 Py_DECREF(arg);
3754 if (!output)
3755 return 0;
3756 if (!PyBytes_Check(output)) {
3757 Py_DECREF(output);
3758 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3759 return 0;
3760 }
3761 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003762 size = PyBytes_GET_SIZE(output);
3763 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003764 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003765 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003766 Py_DECREF(output);
3767 return 0;
3768 }
3769 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003770 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003771}
3772
3773
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003774int
3775PyUnicode_FSDecoder(PyObject* arg, void* addr)
3776{
3777 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003778 if (arg == NULL) {
3779 Py_DECREF(*(PyObject**)addr);
3780 return 1;
3781 }
3782 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003783 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003785 output = arg;
3786 Py_INCREF(output);
3787 }
3788 else {
3789 arg = PyBytes_FromObject(arg);
3790 if (!arg)
3791 return 0;
3792 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3793 PyBytes_GET_SIZE(arg));
3794 Py_DECREF(arg);
3795 if (!output)
3796 return 0;
3797 if (!PyUnicode_Check(output)) {
3798 Py_DECREF(output);
3799 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3800 return 0;
3801 }
3802 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003803 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003804 Py_DECREF(output);
3805 return 0;
3806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003808 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003809 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3810 Py_DECREF(output);
3811 return 0;
3812 }
3813 *(PyObject**)addr = output;
3814 return Py_CLEANUP_SUPPORTED;
3815}
3816
3817
Martin v. Löwis5b222132007-06-10 09:51:05 +00003818char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003820{
Christian Heimesf3863112007-11-22 07:46:41 +00003821 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003822
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003823 if (!PyUnicode_Check(unicode)) {
3824 PyErr_BadArgument();
3825 return NULL;
3826 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003828 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003830 if (PyUnicode_UTF8(unicode) == NULL) {
3831 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3833 if (bytes == NULL)
3834 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003835 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3836 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003837 Py_DECREF(bytes);
3838 return NULL;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3841 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3842 PyBytes_AS_STRING(bytes),
3843 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003844 Py_DECREF(bytes);
3845 }
3846
3847 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003848 *psize = PyUnicode_UTF8_LENGTH(unicode);
3849 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003850}
3851
3852char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003853PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3856}
3857
3858#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003859static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003860#endif
3861
3862
3863Py_UNICODE *
3864PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 const unsigned char *one_byte;
3867#if SIZEOF_WCHAR_T == 4
3868 const Py_UCS2 *two_bytes;
3869#else
3870 const Py_UCS4 *four_bytes;
3871 const Py_UCS4 *ucs4_end;
3872 Py_ssize_t num_surrogates;
3873#endif
3874 wchar_t *w;
3875 wchar_t *wchar_end;
3876
3877 if (!PyUnicode_Check(unicode)) {
3878 PyErr_BadArgument();
3879 return NULL;
3880 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003881 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003883 assert(_PyUnicode_KIND(unicode) != 0);
3884 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003885
3886#ifdef Py_DEBUG
3887 ++unicode_as_unicode_calls;
3888#endif
3889
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003890 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003892 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3893 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 num_surrogates = 0;
3895
3896 for (; four_bytes < ucs4_end; ++four_bytes) {
3897 if (*four_bytes > 0xFFFF)
3898 ++num_surrogates;
3899 }
3900
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003901 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3902 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3903 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904 PyErr_NoMemory();
3905 return NULL;
3906 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003907 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003909 w = _PyUnicode_WSTR(unicode);
3910 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3911 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3913 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003914 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003916 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3917 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003918 }
3919 else
3920 *w = *four_bytes;
3921
3922 if (w > wchar_end) {
3923 assert(0 && "Miscalculated string end");
3924 }
3925 }
3926 *w = 0;
3927#else
3928 /* sizeof(wchar_t) == 4 */
3929 Py_FatalError("Impossible unicode object state, wstr and str "
3930 "should share memory already.");
3931 return NULL;
3932#endif
3933 }
3934 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003935 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3936 (_PyUnicode_LENGTH(unicode) + 1));
3937 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938 PyErr_NoMemory();
3939 return NULL;
3940 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3942 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3943 w = _PyUnicode_WSTR(unicode);
3944 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003946 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3947 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 for (; w < wchar_end; ++one_byte, ++w)
3949 *w = *one_byte;
3950 /* null-terminate the wstr */
3951 *w = 0;
3952 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 for (; w < wchar_end; ++two_bytes, ++w)
3957 *w = *two_bytes;
3958 /* null-terminate the wstr */
3959 *w = 0;
3960#else
3961 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003962 PyObject_FREE(_PyUnicode_WSTR(unicode));
3963 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 Py_FatalError("Impossible unicode object state, wstr "
3965 "and str should share memory already.");
3966 return NULL;
3967#endif
3968 }
3969 else {
3970 assert(0 && "This should never happen.");
3971 }
3972 }
3973 }
3974 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003975 *size = PyUnicode_WSTR_LENGTH(unicode);
3976 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003977}
3978
Alexander Belopolsky40018472011-02-26 01:02:56 +00003979Py_UNICODE *
3980PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983}
3984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985
Alexander Belopolsky40018472011-02-26 01:02:56 +00003986Py_ssize_t
3987PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988{
3989 if (!PyUnicode_Check(unicode)) {
3990 PyErr_BadArgument();
3991 goto onError;
3992 }
3993 return PyUnicode_GET_SIZE(unicode);
3994
Benjamin Peterson29060642009-01-31 22:14:21 +00003995 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 return -1;
3997}
3998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003999Py_ssize_t
4000PyUnicode_GetLength(PyObject *unicode)
4001{
Victor Stinner5a706cf2011-10-02 00:36:53 +02004002 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 PyErr_BadArgument();
4004 return -1;
4005 }
4006
4007 return PyUnicode_GET_LENGTH(unicode);
4008}
4009
4010Py_UCS4
4011PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4012{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004013 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4014 PyErr_BadArgument();
4015 return (Py_UCS4)-1;
4016 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004017 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004018 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004019 return (Py_UCS4)-1;
4020 }
4021 return PyUnicode_READ_CHAR(unicode, index);
4022}
4023
4024int
4025PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4026{
4027 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004028 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029 return -1;
4030 }
Victor Stinner488fa492011-12-12 00:01:39 +01004031 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004032 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004033 PyErr_SetString(PyExc_IndexError, "string index out of range");
4034 return -1;
4035 }
Victor Stinner488fa492011-12-12 00:01:39 +01004036 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004037 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004038 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4039 PyErr_SetString(PyExc_ValueError, "character out of range");
4040 return -1;
4041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004042 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4043 index, ch);
4044 return 0;
4045}
4046
/* Return the name of the default encoding, which is always "utf-8". The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4052
Victor Stinner554f3f02010-06-16 23:33:54 +00004053/* create or adjust a UnicodeDecodeError */
4054static void
4055make_decode_exception(PyObject **exceptionObject,
4056 const char *encoding,
4057 const char *input, Py_ssize_t length,
4058 Py_ssize_t startpos, Py_ssize_t endpos,
4059 const char *reason)
4060{
4061 if (*exceptionObject == NULL) {
4062 *exceptionObject = PyUnicodeDecodeError_Create(
4063 encoding, input, length, startpos, endpos, reason);
4064 }
4065 else {
4066 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4067 goto onError;
4068 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4069 goto onError;
4070 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4071 goto onError;
4072 }
4073 return;
4074
4075onError:
4076 Py_DECREF(*exceptionObject);
4077 *exceptionObject = NULL;
4078}
4079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080/* error handling callback helper:
4081 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004082 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004083 and adjust various state variables.
4084 return 0 on success, -1 on error
4085*/
4086
Alexander Belopolsky40018472011-02-26 01:02:56 +00004087static int
4088unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004089 const char *encoding, const char *reason,
4090 const char **input, const char **inend, Py_ssize_t *startinpos,
4091 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004092 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004093{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004094 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004095
4096 PyObject *restuple = NULL;
4097 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004098 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004100 Py_ssize_t requiredsize;
4101 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004102 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004103 int res = -1;
4104
Victor Stinner596a6c42011-11-09 00:02:18 +01004105 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4106 outsize = PyUnicode_GET_LENGTH(*output);
4107 else
4108 outsize = _PyUnicode_WSTR_LENGTH(*output);
4109
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004111 *errorHandler = PyCodec_LookupError(errors);
4112 if (*errorHandler == NULL)
4113 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004114 }
4115
Victor Stinner554f3f02010-06-16 23:33:54 +00004116 make_decode_exception(exceptionObject,
4117 encoding,
4118 *input, *inend - *input,
4119 *startinpos, *endinpos,
4120 reason);
4121 if (*exceptionObject == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123
4124 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4125 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004128 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004130 }
4131 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004132 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004133 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004134 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004135
4136 /* Copy back the bytes variables, which might have been modified by the
4137 callback */
4138 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4139 if (!inputobj)
4140 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004141 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004142 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004144 *input = PyBytes_AS_STRING(inputobj);
4145 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004146 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004147 /* we can DECREF safely, as the exception has another reference,
4148 so the object won't go away. */
4149 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004153 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004154 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4155 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004156 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157
Victor Stinner596a6c42011-11-09 00:02:18 +01004158 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4159 /* need more space? (at least enough for what we
4160 have+the replacement+the rest of the string (starting
4161 at the new input position), so we won't have to check space
4162 when there are no errors in the rest of the string) */
4163 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4164 requiredsize = *outpos + replen + insize-newpos;
4165 if (requiredsize > outsize) {
4166 if (requiredsize<2*outsize)
4167 requiredsize = 2*outsize;
4168 if (unicode_resize(output, requiredsize) < 0)
4169 goto onError;
4170 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004171 if (unicode_widen(output, *outpos,
4172 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004173 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004174 copy_characters(*output, *outpos, repunicode, 0, replen);
4175 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004177 else {
4178 wchar_t *repwstr;
4179 Py_ssize_t repwlen;
4180 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4181 if (repwstr == NULL)
4182 goto onError;
4183 /* need more space? (at least enough for what we
4184 have+the replacement+the rest of the string (starting
4185 at the new input position), so we won't have to check space
4186 when there are no errors in the rest of the string) */
4187 requiredsize = *outpos + repwlen + insize-newpos;
4188 if (requiredsize > outsize) {
4189 if (requiredsize < 2*outsize)
4190 requiredsize = 2*outsize;
4191 if (unicode_resize(output, requiredsize) < 0)
4192 goto onError;
4193 }
4194 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4195 *outpos += repwlen;
4196 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004197 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004198 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004199
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 /* we made it! */
4201 res = 0;
4202
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004204 Py_XDECREF(restuple);
4205 return res;
4206}
4207
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004208/* --- UTF-7 Codec -------------------------------------------------------- */
4209
Antoine Pitrou244651a2009-05-04 18:56:13 +00004210/* See RFC2152 for details. We encode conservatively and decode liberally. */
4211
/* Three simple macros defining base-64 (alphabet: A-Z, a-z, 0-9, '+', '/').
   IS_BASE64 and FROM_BASE64 evaluate their argument several times, so it
   must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* given that c is a base-64 character, what is its base-64 value?
   (any character not matched by the explicit tests maps to 63) */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
4234
/* DECODE_DIRECT: true when the byte c found in a UTF-7 string decodes to
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4242
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the character code (0-127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be free of side effects.
 * directO and directWS are parenthesized so that any expression can be
 * passed (e.g. "a || b"). */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004288
Alexander Belopolsky40018472011-02-26 01:02:56 +00004289PyObject *
4290PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004291 Py_ssize_t size,
4292 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004294 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4295}
4296
Antoine Pitrou244651a2009-05-04 18:56:13 +00004297/* The decoder. The only state we preserve is our read position,
4298 * i.e. how many characters we have consumed. So if we end in the
4299 * middle of a shift sequence we have to back off the read position
4300 * and the output to the beginning of the sequence, otherwise we lose
4301 * all the shift state (seen bits, number of bits seen, high
4302 * surrogate). */
4303
Alexander Belopolsky40018472011-02-26 01:02:56 +00004304PyObject *
4305PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004306 Py_ssize_t size,
4307 const char *errors,
4308 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004309{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004310 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004311 Py_ssize_t startinpos;
4312 Py_ssize_t endinpos;
4313 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004314 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004315 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004316 const char *errmsg = "";
4317 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004318 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004319 unsigned int base64bits = 0;
4320 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004321 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322 PyObject *errorHandler = NULL;
4323 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004324
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004325 /* Start off assuming it's all ASCII. Widen later as necessary. */
4326 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004327 if (!unicode)
4328 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004329 if (size == 0) {
4330 if (consumed)
4331 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004332 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004333 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004335 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 e = s + size;
4337
4338 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004339 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004340 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004341 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004342
Antoine Pitrou244651a2009-05-04 18:56:13 +00004343 if (inShift) { /* in a base-64 section */
4344 if (IS_BASE64(ch)) { /* consume a base-64 character */
4345 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4346 base64bits += 6;
4347 s++;
4348 if (base64bits >= 16) {
4349 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004350 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 base64bits -= 16;
4352 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4353 if (surrogate) {
4354 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004355 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4356 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4358 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004360 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 }
4362 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004363 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4364 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 }
4367 }
Victor Stinner551ac952011-11-29 22:58:13 +01004368 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004369 /* first surrogate */
4370 surrogate = outCh;
4371 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004373 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4374 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 }
4376 }
4377 }
4378 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 inShift = 0;
4380 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004382 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4383 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004384 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 if (base64bits > 0) { /* left-over bits */
4387 if (base64bits >= 6) {
4388 /* We've seen at least one base-64 character */
4389 errmsg = "partial character in shift sequence";
4390 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004392 else {
4393 /* Some bits remain; they should be zero */
4394 if (base64buffer != 0) {
4395 errmsg = "non-zero padding bits in shift sequence";
4396 goto utf7Error;
4397 }
4398 }
4399 }
4400 if (ch != '-') {
4401 /* '-' is absorbed; other terminating
4402 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004403 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4404 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004405 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 }
4407 }
4408 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004410 s++; /* consume '+' */
4411 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004412 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004413 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4414 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 }
4416 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004418 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004420 }
4421 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004423 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4424 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 s++;
4426 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 else {
4428 startinpos = s-starts;
4429 s++;
4430 errmsg = "unexpected special character";
4431 goto utf7Error;
4432 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004433 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004434utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435 endinpos = s-starts;
4436 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 errors, &errorHandler,
4438 "utf7", errmsg,
4439 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004440 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 }
4443
Antoine Pitrou244651a2009-05-04 18:56:13 +00004444 /* end of string */
4445
4446 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4447 /* if we're in an inconsistent state, that's an error */
4448 if (surrogate ||
4449 (base64bits >= 6) ||
4450 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 endinpos = size;
4452 if (unicode_decode_call_errorhandler(
4453 errors, &errorHandler,
4454 "utf7", "unterminated shift sequence",
4455 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004456 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457 goto onError;
4458 if (s < e)
4459 goto restart;
4460 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004461 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462
4463 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004464 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004465 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004467 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004468 }
4469 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004470 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004473
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004474 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475 goto onError;
4476
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004477 Py_XDECREF(errorHandler);
4478 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004479 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480
Benjamin Peterson29060642009-01-31 22:14:21 +00004481 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 Py_XDECREF(errorHandler);
4483 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 Py_DECREF(unicode);
4485 return NULL;
4486}
4487
4488
Alexander Belopolsky40018472011-02-26 01:02:56 +00004489PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004490_PyUnicode_EncodeUTF7(PyObject *str,
4491 int base64SetO,
4492 int base64WhiteSpace,
4493 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004495 int kind;
4496 void *data;
4497 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004498 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004501 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 unsigned int base64bits = 0;
4503 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 char * out;
4505 char * start;
4506
Benjamin Petersonbac79492012-01-14 13:34:47 -05004507 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004508 return NULL;
4509 kind = PyUnicode_KIND(str);
4510 data = PyUnicode_DATA(str);
4511 len = PyUnicode_GET_LENGTH(str);
4512
4513 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 /* It might be possible to tighten this worst case */
4517 allocated = 8 * len;
4518 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004519 return PyErr_NoMemory();
4520
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004522 if (v == NULL)
4523 return NULL;
4524
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004525 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004526 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004527 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (inShift) {
4530 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4531 /* shifting out */
4532 if (base64bits) { /* output remaining bits */
4533 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4534 base64buffer = 0;
4535 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 }
4537 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 /* Characters not in the BASE64 set implicitly unshift the sequence
4539 so no '-' is required, except if the character is itself a '-' */
4540 if (IS_BASE64(ch) || ch == '-') {
4541 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 *out++ = (char) ch;
4544 }
4545 else {
4546 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004547 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 else { /* not in a shift sequence */
4550 if (ch == '+') {
4551 *out++ = '+';
4552 *out++ = '-';
4553 }
4554 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4555 *out++ = (char) ch;
4556 }
4557 else {
4558 *out++ = '+';
4559 inShift = 1;
4560 goto encode_char;
4561 }
4562 }
4563 continue;
4564encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004566 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004567
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568 /* code first surrogate */
4569 base64bits += 16;
4570 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4571 while (base64bits >= 6) {
4572 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4573 base64bits -= 6;
4574 }
4575 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004576 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 base64bits += 16;
4579 base64buffer = (base64buffer << 16) | ch;
4580 while (base64bits >= 6) {
4581 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4582 base64bits -= 6;
4583 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 if (base64bits)
4586 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4587 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004589 if (_PyBytes_Resize(&v, out - start) < 0)
4590 return NULL;
4591 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004592}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004593PyObject *
4594PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4595 Py_ssize_t size,
4596 int base64SetO,
4597 int base64WhiteSpace,
4598 const char *errors)
4599{
4600 PyObject *result;
4601 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4602 if (tmp == NULL)
4603 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004604 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004605 base64WhiteSpace, errors);
4606 Py_DECREF(tmp);
4607 return result;
4608}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004609
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610#undef IS_BASE64
4611#undef FROM_BASE64
4612#undef TO_BASE64
4613#undef DECODE_DIRECT
4614#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615
Guido van Rossumd57fd912000-03-10 22:53:23 +00004616/* --- UTF-8 Codec -------------------------------------------------------- */
4617
Alexander Belopolsky40018472011-02-26 01:02:56 +00004618PyObject *
4619PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004620 Py_ssize_t size,
4621 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622{
Walter Dörwald69652032004-09-07 20:24:22 +00004623 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4624}
4625
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004626#include "stringlib/asciilib.h"
4627#include "stringlib/codecs.h"
4628#include "stringlib/undef.h"
4629
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004630#include "stringlib/ucs1lib.h"
4631#include "stringlib/codecs.h"
4632#include "stringlib/undef.h"
4633
4634#include "stringlib/ucs2lib.h"
4635#include "stringlib/codecs.h"
4636#include "stringlib/undef.h"
4637
4638#include "stringlib/ucs4lib.h"
4639#include "stringlib/codecs.h"
4640#include "stringlib/undef.h"
4641
Antoine Pitrouab868312009-01-10 15:40:25 +00004642/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4643#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4644
4645/* Mask to quickly check whether a C 'long' contains a
4646 non-ASCII, UTF8-encoded char. */
4647#if (SIZEOF_LONG == 8)
4648# define ASCII_CHAR_MASK 0x8080808080808080L
4649#elif (SIZEOF_LONG == 4)
4650# define ASCII_CHAR_MASK 0x80808080L
4651#else
4652# error C 'long' size should be either 4 or 8!
4653#endif
4654
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004655static Py_ssize_t
4656ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004657{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658 const char *p = start;
4659 const char *aligned_end = (const char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004660
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661#if SIZEOF_LONG <= SIZEOF_VOID_P
4662 assert(!((size_t) dest & LONG_PTR_MASK));
4663 if (!((size_t) p & LONG_PTR_MASK)) {
4664 /* Fast path, see in STRINGLIB(utf8_decode) for
4665 an explanation. */
4666 /* Help register allocation */
4667 register const char *_p = p;
4668 register Py_UCS1 * q = dest;
4669 while (_p < aligned_end) {
4670 unsigned long value = *(const unsigned long *) _p;
4671 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 *((unsigned long *)q) = value;
4674 _p += SIZEOF_LONG;
4675 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004676 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004677 p = _p;
4678 while (p < end) {
4679 if ((unsigned char)*p & 0x80)
4680 break;
4681 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004685#endif
4686 while (p < end) {
4687 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4688 for an explanation. */
4689 if (!((size_t) p & LONG_PTR_MASK)) {
4690 /* Help register allocation */
4691 register const char *_p = p;
4692 while (_p < aligned_end) {
4693 unsigned long value = *(unsigned long *) _p;
4694 if (value & ASCII_CHAR_MASK)
4695 break;
4696 _p += SIZEOF_LONG;
4697 }
4698 p = _p;
4699 if (_p == end)
4700 break;
4701 }
4702 if ((unsigned char)*p & 0x80)
4703 break;
4704 ++p;
4705 }
4706 memcpy(dest, start, p - start);
4707 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708}
Antoine Pitrouab868312009-01-10 15:40:25 +00004709
Victor Stinner785938e2011-12-11 20:09:03 +01004710PyObject *
4711PyUnicode_DecodeUTF8Stateful(const char *s,
4712 Py_ssize_t size,
4713 const char *errors,
4714 Py_ssize_t *consumed)
4715{
Victor Stinner785938e2011-12-11 20:09:03 +01004716 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004717 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 const char *end = s + size;
4719 Py_ssize_t outpos;
4720
4721 Py_ssize_t startinpos;
4722 Py_ssize_t endinpos;
4723 const char *errmsg = "";
4724 PyObject *errorHandler = NULL;
4725 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004726
4727 if (size == 0) {
4728 if (consumed)
4729 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004730 Py_INCREF(unicode_empty);
4731 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004732 }
4733
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4735 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004736 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 *consumed = 1;
4738 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004739 }
4740
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004741 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004742 if (!unicode)
4743 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004744
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4746 s += outpos;
4747 while (s < end) {
4748 Py_UCS4 ch;
4749 int kind = PyUnicode_KIND(unicode);
4750 if (kind == PyUnicode_1BYTE_KIND) {
4751 if (PyUnicode_IS_ASCII(unicode))
4752 ch = asciilib_utf8_decode(&s, end,
4753 PyUnicode_1BYTE_DATA(unicode), &outpos);
4754 else
4755 ch = ucs1lib_utf8_decode(&s, end,
4756 PyUnicode_1BYTE_DATA(unicode), &outpos);
4757 } else if (kind == PyUnicode_2BYTE_KIND) {
4758 ch = ucs2lib_utf8_decode(&s, end,
4759 PyUnicode_2BYTE_DATA(unicode), &outpos);
4760 } else {
4761 assert(kind == PyUnicode_4BYTE_KIND);
4762 ch = ucs4lib_utf8_decode(&s, end,
4763 PyUnicode_4BYTE_DATA(unicode), &outpos);
4764 }
4765
4766 switch (ch) {
4767 case 0:
4768 if (s == end || consumed)
4769 goto End;
4770 errmsg = "unexpected end of data";
4771 startinpos = s - starts;
4772 endinpos = startinpos + 1;
4773 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4774 endinpos++;
4775 break;
4776 case 1:
4777 errmsg = "invalid start byte";
4778 startinpos = s - starts;
4779 endinpos = startinpos + 1;
4780 break;
4781 case 2:
4782 errmsg = "invalid continuation byte";
4783 startinpos = s - starts;
4784 endinpos = startinpos + 1;
4785 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4786 endinpos++;
4787 break;
4788 default:
4789 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4790 goto onError;
4791 continue;
4792 }
4793
4794 if (unicode_decode_call_errorhandler(
4795 errors, &errorHandler,
4796 "utf-8", errmsg,
4797 &starts, &end, &startinpos, &endinpos, &exc, &s,
4798 &unicode, &outpos))
4799 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004800 }
4801
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802End:
4803 if (unicode_resize(&unicode, outpos) < 0)
4804 goto onError;
4805
4806 if (consumed)
4807 *consumed = s - starts;
4808
4809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
4811 assert(_PyUnicode_CheckConsistency(unicode, 1));
4812 return unicode;
4813
4814onError:
4815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
4817 Py_XDECREF(unicode);
4818 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004819}
4820
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004821#ifdef __APPLE__
4822
4823/* Simplified UTF-8 decoder using surrogateescape error handler,
4824 used to decode the command line arguments on Mac OS X. */
4825
4826wchar_t*
4827_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4828{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004829 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830 wchar_t *unicode;
4831 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004832
4833 /* Note: size will always be longer than the resulting Unicode
4834 character count */
4835 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4836 PyErr_NoMemory();
4837 return NULL;
4838 }
4839 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4840 if (!unicode)
4841 return NULL;
4842
4843 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004844 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004845 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004846 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004848#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004850#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004852#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 if (ch > 0xFF) {
4854#if SIZEOF_WCHAR_T == 4
4855 assert(0);
4856#else
4857 assert(Py_UNICODE_IS_SURROGATE(ch));
4858 /* compute and append the two surrogates: */
4859 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4860 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4861#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004862 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 else {
4864 if (!ch && s == e)
4865 break;
4866 /* surrogateescape */
4867 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4868 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004869 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004870 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004871 return unicode;
4872}
4873
4874#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876/* Primary internal function which creates utf8 encoded bytes objects.
4877
4878 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004879 and allocate exactly as much space needed at the end. Else allocate the
4880 maximum possible needed (4 result bytes per Unicode character), and return
4881 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004882*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004883PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004884_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885{
Victor Stinner6099a032011-12-18 14:22:26 +01004886 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887 void *data;
4888 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890 if (!PyUnicode_Check(unicode)) {
4891 PyErr_BadArgument();
4892 return NULL;
4893 }
4894
4895 if (PyUnicode_READY(unicode) == -1)
4896 return NULL;
4897
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004898 if (PyUnicode_UTF8(unicode))
4899 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4900 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004901
4902 kind = PyUnicode_KIND(unicode);
4903 data = PyUnicode_DATA(unicode);
4904 size = PyUnicode_GET_LENGTH(unicode);
4905
Benjamin Petersonead6b532011-12-20 17:23:42 -06004906 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004907 default:
4908 assert(0);
4909 case PyUnicode_1BYTE_KIND:
4910 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4911 assert(!PyUnicode_IS_ASCII(unicode));
4912 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4913 case PyUnicode_2BYTE_KIND:
4914 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4915 case PyUnicode_4BYTE_KIND:
4916 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004917 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918}
4919
Alexander Belopolsky40018472011-02-26 01:02:56 +00004920PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004921PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4922 Py_ssize_t size,
4923 const char *errors)
4924{
4925 PyObject *v, *unicode;
4926
4927 unicode = PyUnicode_FromUnicode(s, size);
4928 if (unicode == NULL)
4929 return NULL;
4930 v = _PyUnicode_AsUTF8String(unicode, errors);
4931 Py_DECREF(unicode);
4932 return v;
4933}
4934
4935PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004936PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004938 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939}
4940
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941/* --- UTF-32 Codec ------------------------------------------------------- */
4942
4943PyObject *
4944PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 Py_ssize_t size,
4946 const char *errors,
4947 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948{
4949 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4950}
4951
4952PyObject *
4953PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 Py_ssize_t size,
4955 const char *errors,
4956 int *byteorder,
4957 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004958{
4959 const char *starts = s;
4960 Py_ssize_t startinpos;
4961 Py_ssize_t endinpos;
4962 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004963 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004964 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965 int bo = 0; /* assume native ordering by default */
4966 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967 /* Offsets from q for retrieving bytes in the right order. */
4968#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4969 int iorder[] = {0, 1, 2, 3};
4970#else
4971 int iorder[] = {3, 2, 1, 0};
4972#endif
4973 PyObject *errorHandler = NULL;
4974 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004975
Walter Dörwald41980ca2007-08-16 21:55:45 +00004976 q = (unsigned char *)s;
4977 e = q + size;
4978
4979 if (byteorder)
4980 bo = *byteorder;
4981
4982 /* Check for BOM marks (U+FEFF) in the input and adjust current
4983 byte order setting accordingly. In native mode, the leading BOM
4984 mark is skipped, in all other modes, it is copied to the output
4985 stream as-is (giving a ZWNBSP character). */
4986 if (bo == 0) {
4987 if (size >= 4) {
4988 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 if (bom == 0x0000FEFF) {
4992 q += 4;
4993 bo = -1;
4994 }
4995 else if (bom == 0xFFFE0000) {
4996 q += 4;
4997 bo = 1;
4998 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005000 if (bom == 0x0000FEFF) {
5001 q += 4;
5002 bo = 1;
5003 }
5004 else if (bom == 0xFFFE0000) {
5005 q += 4;
5006 bo = -1;
5007 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005010 }
5011
5012 if (bo == -1) {
5013 /* force LE */
5014 iorder[0] = 0;
5015 iorder[1] = 1;
5016 iorder[2] = 2;
5017 iorder[3] = 3;
5018 }
5019 else if (bo == 1) {
5020 /* force BE */
5021 iorder[0] = 3;
5022 iorder[1] = 2;
5023 iorder[2] = 1;
5024 iorder[3] = 0;
5025 }
5026
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005027 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005028 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005029 if (!unicode)
5030 return NULL;
5031 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005032 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005033 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005034
Walter Dörwald41980ca2007-08-16 21:55:45 +00005035 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 Py_UCS4 ch;
5037 /* remaining bytes at the end? (size should be divisible by 4) */
5038 if (e-q<4) {
5039 if (consumed)
5040 break;
5041 errmsg = "truncated data";
5042 startinpos = ((const char *)q)-starts;
5043 endinpos = ((const char *)e)-starts;
5044 goto utf32Error;
5045 /* The remaining input chars are ignored if the callback
5046 chooses to skip the input */
5047 }
5048 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5049 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 if (ch >= 0x110000)
5052 {
5053 errmsg = "codepoint not in range(0x110000)";
5054 startinpos = ((const char *)q)-starts;
5055 endinpos = startinpos+4;
5056 goto utf32Error;
5057 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005058 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5059 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 q += 4;
5061 continue;
5062 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 if (unicode_decode_call_errorhandler(
5064 errors, &errorHandler,
5065 "utf32", errmsg,
5066 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005067 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069 }
5070
5071 if (byteorder)
5072 *byteorder = bo;
5073
5074 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076
5077 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005078 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079 goto onError;
5080
5081 Py_XDECREF(errorHandler);
5082 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005083 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 Py_DECREF(unicode);
5087 Py_XDECREF(errorHandler);
5088 Py_XDECREF(exc);
5089 return NULL;
5090}
5091
5092PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005093_PyUnicode_EncodeUTF32(PyObject *str,
5094 const char *errors,
5095 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005097 int kind;
5098 void *data;
5099 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005100 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005101 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005102 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 /* Offsets from p for storing byte pairs in the right order. */
5104#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5105 int iorder[] = {0, 1, 2, 3};
5106#else
5107 int iorder[] = {3, 2, 1, 0};
5108#endif
5109
Benjamin Peterson29060642009-01-31 22:14:21 +00005110#define STORECHAR(CH) \
5111 do { \
5112 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5113 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5114 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5115 p[iorder[0]] = (CH) & 0xff; \
5116 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 } while(0)
5118
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119 if (!PyUnicode_Check(str)) {
5120 PyErr_BadArgument();
5121 return NULL;
5122 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005123 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005124 return NULL;
5125 kind = PyUnicode_KIND(str);
5126 data = PyUnicode_DATA(str);
5127 len = PyUnicode_GET_LENGTH(str);
5128
5129 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005130 bytesize = nsize * 4;
5131 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005133 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005141 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
5143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
5149 }
5150 else if (byteorder == 1) {
5151 /* force BE */
5152 iorder[0] = 3;
5153 iorder[1] = 2;
5154 iorder[2] = 1;
5155 iorder[3] = 0;
5156 }
5157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005158 for (i = 0; i < len; i++)
5159 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005160
5161 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005162 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163#undef STORECHAR
5164}
5165
Alexander Belopolsky40018472011-02-26 01:02:56 +00005166PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005167PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5168 Py_ssize_t size,
5169 const char *errors,
5170 int byteorder)
5171{
5172 PyObject *result;
5173 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5174 if (tmp == NULL)
5175 return NULL;
5176 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5177 Py_DECREF(tmp);
5178 return result;
5179}
5180
5181PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005182PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005183{
Victor Stinnerb960b342011-11-20 19:12:52 +01005184 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185}
5186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187/* --- UTF-16 Codec ------------------------------------------------------- */
5188
Tim Peters772747b2001-08-09 22:21:55 +00005189PyObject *
5190PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 Py_ssize_t size,
5192 const char *errors,
5193 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
Walter Dörwald69652032004-09-07 20:24:22 +00005195 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5196}
5197
5198PyObject *
5199PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 Py_ssize_t size,
5201 const char *errors,
5202 int *byteorder,
5203 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005204{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t startinpos;
5207 Py_ssize_t endinpos;
5208 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005209 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005210 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005211 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005213 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Tim Peters772747b2001-08-09 22:21:55 +00005217 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005218 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
5220 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005221 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005223 /* Check for BOM marks (U+FEFF) in the input and adjust current
5224 byte order setting accordingly. In native mode, the leading BOM
5225 mark is skipped, in all other modes, it is copied to the output
5226 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 if (bo == 0 && size >= 2) {
5228 const Py_UCS4 bom = (q[1] << 8) | q[0];
5229 if (bom == 0xFEFF) {
5230 q += 2;
5231 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005233 else if (bom == 0xFFFE) {
5234 q += 2;
5235 bo = 1;
5236 }
5237 if (byteorder)
5238 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 if (q == e) {
5242 if (consumed)
5243 *consumed = size;
5244 Py_INCREF(unicode_empty);
5245 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005246 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005247
Antoine Pitrouab868312009-01-10 15:40:25 +00005248#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005250#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005251 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005252#endif
Tim Peters772747b2001-08-09 22:21:55 +00005253
Antoine Pitrou63065d72012-05-15 23:48:04 +02005254 /* Note: size will always be longer than the resulting Unicode
5255 character count */
5256 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5257 if (!unicode)
5258 return NULL;
5259
5260 outpos = 0;
5261 while (1) {
5262 Py_UCS4 ch = 0;
5263 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005264 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005265 if (kind == PyUnicode_1BYTE_KIND) {
5266 if (PyUnicode_IS_ASCII(unicode))
5267 ch = asciilib_utf16_decode(&q, e,
5268 PyUnicode_1BYTE_DATA(unicode), &outpos,
5269 native_ordering);
5270 else
5271 ch = ucs1lib_utf16_decode(&q, e,
5272 PyUnicode_1BYTE_DATA(unicode), &outpos,
5273 native_ordering);
5274 } else if (kind == PyUnicode_2BYTE_KIND) {
5275 ch = ucs2lib_utf16_decode(&q, e,
5276 PyUnicode_2BYTE_DATA(unicode), &outpos,
5277 native_ordering);
5278 } else {
5279 assert(kind == PyUnicode_4BYTE_KIND);
5280 ch = ucs4lib_utf16_decode(&q, e,
5281 PyUnicode_4BYTE_DATA(unicode), &outpos,
5282 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005283 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005284 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005285
Antoine Pitrou63065d72012-05-15 23:48:04 +02005286 switch (ch)
5287 {
5288 case 0:
5289 /* remaining byte at the end? (size should be even) */
5290 if (q == e || consumed)
5291 goto End;
5292 errmsg = "truncated data";
5293 startinpos = ((const char *)q) - starts;
5294 endinpos = ((const char *)e) - starts;
5295 break;
5296 /* The remaining input chars are ignored if the callback
5297 chooses to skip the input */
5298 case 1:
5299 errmsg = "unexpected end of data";
5300 startinpos = ((const char *)q) - 2 - starts;
5301 endinpos = ((const char *)e) - starts;
5302 break;
5303 case 2:
5304 errmsg = "illegal encoding";
5305 startinpos = ((const char *)q) - 2 - starts;
5306 endinpos = startinpos + 2;
5307 break;
5308 case 3:
5309 errmsg = "illegal UTF-16 surrogate";
5310 startinpos = ((const char *)q) - 4 - starts;
5311 endinpos = startinpos + 2;
5312 break;
5313 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005314 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5315 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005316 continue;
5317 }
5318
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005320 errors,
5321 &errorHandler,
5322 "utf16", errmsg,
5323 &starts,
5324 (const char **)&e,
5325 &startinpos,
5326 &endinpos,
5327 &exc,
5328 (const char **)&q,
5329 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005330 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005331 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005332 }
5333
Antoine Pitrou63065d72012-05-15 23:48:04 +02005334End:
Walter Dörwald69652032004-09-07 20:24:22 +00005335 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005336 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005337
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005339 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 goto onError;
5341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005342 Py_XDECREF(errorHandler);
5343 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005344 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005348 Py_XDECREF(errorHandler);
5349 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350 return NULL;
5351}
5352
Tim Peters772747b2001-08-09 22:21:55 +00005353PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005354_PyUnicode_EncodeUTF16(PyObject *str,
5355 const char *errors,
5356 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005358 int kind;
5359 void *data;
5360 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005361 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005362 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005363 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005364 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005365 /* Offsets from p for storing byte pairs in the right order. */
5366#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5367 int ihi = 1, ilo = 0;
5368#else
5369 int ihi = 0, ilo = 1;
5370#endif
5371
Benjamin Peterson29060642009-01-31 22:14:21 +00005372#define STORECHAR(CH) \
5373 do { \
5374 p[ihi] = ((CH) >> 8) & 0xff; \
5375 p[ilo] = (CH) & 0xff; \
5376 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005377 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 if (!PyUnicode_Check(str)) {
5380 PyErr_BadArgument();
5381 return NULL;
5382 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005383 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005384 return NULL;
5385 kind = PyUnicode_KIND(str);
5386 data = PyUnicode_DATA(str);
5387 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005388
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 pairs = 0;
5390 if (kind == PyUnicode_4BYTE_KIND)
5391 for (i = 0; i < len; i++)
5392 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5393 pairs++;
5394 /* 2 * (len + pairs + (byteorder == 0)) */
5395 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005397 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005398 bytesize = nsize * 2;
5399 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005400 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005401 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (v == NULL)
5403 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005405 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005408 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005409 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005410
5411 if (byteorder == -1) {
5412 /* force LE */
5413 ihi = 1;
5414 ilo = 0;
5415 }
5416 else if (byteorder == 1) {
5417 /* force BE */
5418 ihi = 0;
5419 ilo = 1;
5420 }
5421
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005422 for (i = 0; i < len; i++) {
5423 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5424 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005426 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5427 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005428 }
Tim Peters772747b2001-08-09 22:21:55 +00005429 STORECHAR(ch);
5430 if (ch2)
5431 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005432 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005433
5434 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005435 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005436#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437}
5438
Alexander Belopolsky40018472011-02-26 01:02:56 +00005439PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005440PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5441 Py_ssize_t size,
5442 const char *errors,
5443 int byteorder)
5444{
5445 PyObject *result;
5446 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5447 if (tmp == NULL)
5448 return NULL;
5449 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5450 Py_DECREF(tmp);
5451 return result;
5452}
5453
5454PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005455PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005457 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458}
5459
5460/* --- Unicode Escape Codec ----------------------------------------------- */
5461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005462/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5463 if all the escapes in the string make it still a valid ASCII string.
5464 Returns -1 if any escapes were found which cause the string to
5465 pop out of ASCII range. Otherwise returns the length of the
5466 required buffer to hold the string.
5467 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005468static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005469length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5470{
5471 const unsigned char *p = (const unsigned char *)s;
5472 const unsigned char *end = p + size;
5473 Py_ssize_t length = 0;
5474
5475 if (size < 0)
5476 return -1;
5477
5478 for (; p < end; ++p) {
5479 if (*p > 127) {
5480 /* Non-ASCII */
5481 return -1;
5482 }
5483 else if (*p != '\\') {
5484 /* Normal character */
5485 ++length;
5486 }
5487 else {
5488 /* Backslash-escape, check next char */
5489 ++p;
5490 /* Escape sequence reaches till end of string or
5491 non-ASCII follow-up. */
5492 if (p >= end || *p > 127)
5493 return -1;
5494 switch (*p) {
5495 case '\n':
5496 /* backslash + \n result in zero characters */
5497 break;
5498 case '\\': case '\'': case '\"':
5499 case 'b': case 'f': case 't':
5500 case 'n': case 'r': case 'v': case 'a':
5501 ++length;
5502 break;
5503 case '0': case '1': case '2': case '3':
5504 case '4': case '5': case '6': case '7':
5505 case 'x': case 'u': case 'U': case 'N':
5506 /* these do not guarantee ASCII characters */
5507 return -1;
5508 default:
5509 /* count the backslash + the other character */
5510 length += 2;
5511 }
5512 }
5513 }
5514 return length;
5515}
5516
/* Name-lookup capsule used to resolve \N{name} escapes.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in with PyCapsule_Import() the
   first time it encounters a \N escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005517static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005518
Alexander Belopolsky40018472011-02-26 01:02:56 +00005519PyObject *
5520PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005521 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005522 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005525 Py_ssize_t startinpos;
5526 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005527 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005528 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530 char* message;
5531 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005532 PyObject *errorHandler = NULL;
5533 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005534 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005535 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005537 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005538
5539 /* After length_of_escaped_ascii_string() there are two alternatives,
5540 either the string is pure ASCII with named escapes like \n, etc.
5541 and we determined it's exact size (common case)
5542 or it contains \x, \u, ... escape sequences. then we create a
5543 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005544 if (len >= 0) {
5545 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005546 if (!v)
5547 goto onError;
5548 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005549 }
5550 else {
5551 /* Escaped strings will always be longer than the resulting
5552 Unicode string, so we start with size here and then reduce the
5553 length after conversion to the true value.
5554 (but if the error callback returns a long replacement string
5555 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005556 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557 if (!v)
5558 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005560 }
5561
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005563 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005566
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 while (s < end) {
5568 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005569 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005572 /* The only case in which i == ascii_length is a backslash
5573 followed by a newline. */
5574 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 /* Non-escape characters are interpreted as Unicode ordinals */
5577 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005578 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5579 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 continue;
5581 }
5582
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005583 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005584 /* \ - Escapes */
5585 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005586 c = *s++;
5587 if (s > end)
5588 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590 /* The only case in which i == ascii_length is a backslash
5591 followed by a newline. */
5592 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005594 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005597#define WRITECHAR(ch) \
5598 do { \
5599 if (unicode_putchar(&v, &i, ch) < 0) \
5600 goto onError; \
5601 }while(0)
5602
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005604 case '\\': WRITECHAR('\\'); break;
5605 case '\'': WRITECHAR('\''); break;
5606 case '\"': WRITECHAR('\"'); break;
5607 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005609 case 'f': WRITECHAR('\014'); break;
5610 case 't': WRITECHAR('\t'); break;
5611 case 'n': WRITECHAR('\n'); break;
5612 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005614 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005615 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005616 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 case '0': case '1': case '2': case '3':
5620 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005621 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005622 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005623 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005624 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005625 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005627 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 break;
5629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 /* hex escapes */
5631 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005633 digits = 2;
5634 message = "truncated \\xXX escape";
5635 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005639 digits = 4;
5640 message = "truncated \\uXXXX escape";
5641 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642
Benjamin Peterson29060642009-01-31 22:14:21 +00005643 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005644 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645 digits = 8;
5646 message = "truncated \\UXXXXXXXX escape";
5647 hexescape:
5648 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649 if (s+digits>end) {
5650 endinpos = size;
5651 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 errors, &errorHandler,
5653 "unicodeescape", "end of string in escape sequence",
5654 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005655 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 goto onError;
5657 goto nextByte;
5658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 for (j = 0; j < digits; ++j) {
5660 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005661 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005662 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 errors, &errorHandler,
5665 "unicodeescape", message,
5666 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005667 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005668 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005669 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005670 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005671 }
5672 chr = (chr<<4) & ~0xF;
5673 if (c >= '0' && c <= '9')
5674 chr += c - '0';
5675 else if (c >= 'a' && c <= 'f')
5676 chr += 10 + c - 'a';
5677 else
5678 chr += 10 + c - 'A';
5679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005681 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005682 /* _decoding_error will have already written into the
5683 target buffer. */
5684 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005685 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005686 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005687 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005688 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005689 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 errors, &errorHandler,
5693 "unicodeescape", "illegal Unicode character",
5694 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005696 goto onError;
5697 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005698 break;
5699
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005701 case 'N':
5702 message = "malformed \\N character escape";
5703 if (ucnhash_CAPI == NULL) {
5704 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005705 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5706 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005707 if (ucnhash_CAPI == NULL)
5708 goto ucnhashError;
5709 }
5710 if (*s == '{') {
5711 const char *start = s+1;
5712 /* look for the closing brace */
5713 while (*s != '}' && s < end)
5714 s++;
5715 if (s > start && s < end && *s == '}') {
5716 /* found a name. look it up in the unicode database */
5717 message = "unknown Unicode character name";
5718 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005719 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005720 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005721 goto store;
5722 }
5723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005725 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 errors, &errorHandler,
5727 "unicodeescape", message,
5728 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005729 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005730 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 break;
5732
5733 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005734 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 message = "\\ at end of string";
5736 s--;
5737 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 errors, &errorHandler,
5740 "unicodeescape", message,
5741 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005742 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005743 goto onError;
5744 }
5745 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005746 WRITECHAR('\\');
5747 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005748 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005749 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005754#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005755
Victor Stinner16e6a802011-12-12 13:24:15 +01005756 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005757 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005758 Py_XDECREF(errorHandler);
5759 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005760 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005763 PyErr_SetString(
5764 PyExc_UnicodeError,
5765 "\\N escapes not supported (can't load unicodedata module)"
5766 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005767 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 Py_XDECREF(errorHandler);
5769 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005770 return NULL;
5771
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 Py_XDECREF(errorHandler);
5775 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 return NULL;
5777}
5778
5779/* Return a Unicode-Escape string version of the Unicode object.
5780
5781 If quotes is true, the string is enclosed in u"" or u'' quotes as
5782 appropriate.
5783
5784*/
5785
Alexander Belopolsky40018472011-02-26 01:02:56 +00005786PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005787PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005789 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005790 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005792 int kind;
5793 void *data;
5794 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795
Thomas Wouters89f507f2006-12-13 04:49:30 +00005796 /* Initial allocation is based on the longest-possible unichr
5797 escape.
5798
5799 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5800 unichr, so in this case it's the longest unichr escape. In
5801 narrow (UTF-16) builds this is five chars per source unichr
5802 since there are two unichrs in the surrogate pair, so in narrow
5803 (UTF-16) builds it's not the longest unichr escape.
5804
5805 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5806 so in the narrow (UTF-16) build case it's the longest unichr
5807 escape.
5808 */
5809
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005810 if (!PyUnicode_Check(unicode)) {
5811 PyErr_BadArgument();
5812 return NULL;
5813 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005814 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005815 return NULL;
5816 len = PyUnicode_GET_LENGTH(unicode);
5817 kind = PyUnicode_KIND(unicode);
5818 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005819 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005820 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5821 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5822 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5823 }
5824
5825 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005826 return PyBytes_FromStringAndSize(NULL, 0);
5827
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005828 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005830
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005831 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005833 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 if (repr == NULL)
5836 return NULL;
5837
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005838 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005840 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005841 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005842
Walter Dörwald79e913e2007-05-12 11:08:06 +00005843 /* Escape backslashes */
5844 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 *p++ = '\\';
5846 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005847 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005848 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005849
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005850 /* Map 21-bit characters to '\U00xxxxxx' */
5851 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005852 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005853 *p++ = '\\';
5854 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005855 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5856 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5857 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5858 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5859 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5860 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5861 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5862 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005864 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005865
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005867 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868 *p++ = '\\';
5869 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005870 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5871 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5872 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5873 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005875
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005876 /* Map special whitespace to '\t', \n', '\r' */
5877 else if (ch == '\t') {
5878 *p++ = '\\';
5879 *p++ = 't';
5880 }
5881 else if (ch == '\n') {
5882 *p++ = '\\';
5883 *p++ = 'n';
5884 }
5885 else if (ch == '\r') {
5886 *p++ = '\\';
5887 *p++ = 'r';
5888 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005889
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005890 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005891 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005893 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005894 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5895 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005896 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005897
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 /* Copy everything else as-is */
5899 else
5900 *p++ = (char) ch;
5901 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005903 assert(p - PyBytes_AS_STRING(repr) > 0);
5904 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5905 return NULL;
5906 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907}
5908
Alexander Belopolsky40018472011-02-26 01:02:56 +00005909PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5911 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005913 PyObject *result;
5914 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5915 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005917 result = PyUnicode_AsUnicodeEscapeString(tmp);
5918 Py_DECREF(tmp);
5919 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920}
5921
5922/* --- Raw Unicode Escape Codec ------------------------------------------- */
5923
Alexander Belopolsky40018472011-02-26 01:02:56 +00005924PyObject *
5925PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005926 Py_ssize_t size,
5927 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005930 Py_ssize_t startinpos;
5931 Py_ssize_t endinpos;
5932 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005933 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 const char *end;
5935 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005936 PyObject *errorHandler = NULL;
5937 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005938
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939 /* Escaped strings will always be longer than the resulting
5940 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005941 length after conversion to the true value. (But decoding error
5942 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005943 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005945 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005947 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005948 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 end = s + size;
5950 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 unsigned char c;
5952 Py_UCS4 x;
5953 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005954 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
Benjamin Peterson29060642009-01-31 22:14:21 +00005956 /* Non-escape characters are interpreted as Unicode ordinals */
5957 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005958 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5959 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005961 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005962 startinpos = s-starts;
5963
5964 /* \u-escapes are only interpreted iff the number of leading
5965 backslashes if odd */
5966 bs = s;
5967 for (;s < end;) {
5968 if (*s != '\\')
5969 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005970 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5971 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 }
5973 if (((s - bs) & 1) == 0 ||
5974 s >= end ||
5975 (*s != 'u' && *s != 'U')) {
5976 continue;
5977 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005978 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 count = *s=='u' ? 4 : 8;
5980 s++;
5981
5982 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 for (x = 0, i = 0; i < count; ++i, ++s) {
5984 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005985 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 endinpos = s-starts;
5987 if (unicode_decode_call_errorhandler(
5988 errors, &errorHandler,
5989 "rawunicodeescape", "truncated \\uXXXX",
5990 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005991 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 goto onError;
5993 goto nextByte;
5994 }
5995 x = (x<<4) & ~0xF;
5996 if (c >= '0' && c <= '9')
5997 x += c - '0';
5998 else if (c >= 'a' && c <= 'f')
5999 x += 10 + c - 'a';
6000 else
6001 x += 10 + c - 'A';
6002 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006003 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006004 if (unicode_putchar(&v, &outpos, x) < 0)
6005 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006006 } else {
6007 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006008 if (unicode_decode_call_errorhandler(
6009 errors, &errorHandler,
6010 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006012 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006014 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006015 nextByte:
6016 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006018 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006020 Py_XDECREF(errorHandler);
6021 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006022 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006023
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006026 Py_XDECREF(errorHandler);
6027 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 return NULL;
6029}
6030
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006035 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036 char *p;
6037 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 Py_ssize_t expandsize, pos;
6039 int kind;
6040 void *data;
6041 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006043 if (!PyUnicode_Check(unicode)) {
6044 PyErr_BadArgument();
6045 return NULL;
6046 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006047 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006048 return NULL;
6049 kind = PyUnicode_KIND(unicode);
6050 data = PyUnicode_DATA(unicode);
6051 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006052 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6053 bytes, and 1 byte characters 4. */
6054 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006055
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006058
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006059 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060 if (repr == NULL)
6061 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006062 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006063 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006065 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006066 for (pos = 0; pos < len; pos++) {
6067 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 /* Map 32-bit characters to '\Uxxxxxxxx' */
6069 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006070 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006071 *p++ = '\\';
6072 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006073 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6074 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6075 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6076 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6077 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6078 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6079 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6080 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006081 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006083 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 *p++ = '\\';
6085 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006086 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6087 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6088 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6089 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 /* Copy everything else as-is */
6092 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 *p++ = (char) ch;
6094 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006095
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006096 assert(p > q);
6097 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006098 return NULL;
6099 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100}
6101
Alexander Belopolsky40018472011-02-26 01:02:56 +00006102PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006103PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6104 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006106 PyObject *result;
6107 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6108 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006109 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006110 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6111 Py_DECREF(tmp);
6112 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113}
6114
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006115/* --- Unicode Internal Codec ------------------------------------------- */
6116
Alexander Belopolsky40018472011-02-26 01:02:56 +00006117PyObject *
6118_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006119 Py_ssize_t size,
6120 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121{
6122 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006123 Py_ssize_t startinpos;
6124 Py_ssize_t endinpos;
6125 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006126 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006127 const char *end;
6128 const char *reason;
6129 PyObject *errorHandler = NULL;
6130 PyObject *exc = NULL;
6131
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006132 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006133 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006134 1))
6135 return NULL;
6136
Thomas Wouters89f507f2006-12-13 04:49:30 +00006137 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006138 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006139 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006141 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006142 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006143 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006144 end = s + size;
6145
6146 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006147 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006148 Py_UCS4 ch;
6149 /* We copy the raw representation one byte at a time because the
6150 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006151 ((char *) &uch)[0] = s[0];
6152 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006153#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006154 ((char *) &uch)[2] = s[2];
6155 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006156#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006157 ch = uch;
6158
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006159 /* We have to sanity check the raw data, otherwise doom looms for
6160 some malformed UCS-4 data. */
6161 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006162#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006163 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006164#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006165 end-s < Py_UNICODE_SIZE
6166 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006168 startinpos = s - starts;
6169 if (end-s < Py_UNICODE_SIZE) {
6170 endinpos = end-starts;
6171 reason = "truncated input";
6172 }
6173 else {
6174 endinpos = s - starts + Py_UNICODE_SIZE;
6175 reason = "illegal code point (> 0x10FFFF)";
6176 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006177 if (unicode_decode_call_errorhandler(
6178 errors, &errorHandler,
6179 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006180 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006181 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006182 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006183 continue;
6184 }
6185
6186 s += Py_UNICODE_SIZE;
6187#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006188 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006189 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006190 Py_UNICODE uch2;
6191 ((char *) &uch2)[0] = s[0];
6192 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006193 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006194 {
Victor Stinner551ac952011-11-29 22:58:13 +01006195 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006196 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006197 }
6198 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006199#endif
6200
6201 if (unicode_putchar(&v, &outpos, ch) < 0)
6202 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006203 }
6204
Victor Stinner16e6a802011-12-12 13:24:15 +01006205 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006206 goto onError;
6207 Py_XDECREF(errorHandler);
6208 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006209 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006212 Py_XDECREF(v);
6213 Py_XDECREF(errorHandler);
6214 Py_XDECREF(exc);
6215 return NULL;
6216}
6217
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218/* --- Latin-1 Codec ------------------------------------------------------ */
6219
Alexander Belopolsky40018472011-02-26 01:02:56 +00006220PyObject *
6221PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006222 Py_ssize_t size,
6223 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006226 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227}
6228
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006229/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006230static void
6231make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006232 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006233 PyObject *unicode,
6234 Py_ssize_t startpos, Py_ssize_t endpos,
6235 const char *reason)
6236{
6237 if (*exceptionObject == NULL) {
6238 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006239 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006240 encoding, unicode, startpos, endpos, reason);
6241 }
6242 else {
6243 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6244 goto onError;
6245 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6246 goto onError;
6247 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6248 goto onError;
6249 return;
6250 onError:
6251 Py_DECREF(*exceptionObject);
6252 *exceptionObject = NULL;
6253 }
6254}
6255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006257static void
6258raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006259 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006260 PyObject *unicode,
6261 Py_ssize_t startpos, Py_ssize_t endpos,
6262 const char *reason)
6263{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006264 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006265 encoding, unicode, startpos, endpos, reason);
6266 if (*exceptionObject != NULL)
6267 PyCodec_StrictErrors(*exceptionObject);
6268}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006269
6270/* error handling callback helper:
6271 build arguments, call the callback and check the arguments,
6272 put the result into newpos and return the replacement string, which
6273 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006274static PyObject *
6275unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006276 PyObject **errorHandler,
6277 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006278 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006279 Py_ssize_t startpos, Py_ssize_t endpos,
6280 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006282 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006283 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006284 PyObject *restuple;
6285 PyObject *resunicode;
6286
6287 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291 }
6292
Benjamin Petersonbac79492012-01-14 13:34:47 -05006293 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006294 return NULL;
6295 len = PyUnicode_GET_LENGTH(unicode);
6296
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006297 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006298 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006300 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301
6302 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006306 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006307 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 Py_DECREF(restuple);
6309 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006311 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 &resunicode, newpos)) {
6313 Py_DECREF(restuple);
6314 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006316 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6317 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6318 Py_DECREF(restuple);
6319 return NULL;
6320 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006321 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006322 *newpos = len + *newpos;
6323 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006324 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6325 Py_DECREF(restuple);
6326 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006327 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006328 Py_INCREF(resunicode);
6329 Py_DECREF(restuple);
6330 return resunicode;
6331}
6332
Alexander Belopolsky40018472011-02-26 01:02:56 +00006333static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006335 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006336 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 /* input state */
6339 Py_ssize_t pos=0, size;
6340 int kind;
6341 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342 /* output object */
6343 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 /* pointer into the output */
6345 char *str;
6346 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006347 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006348 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6349 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350 PyObject *errorHandler = NULL;
6351 PyObject *exc = NULL;
6352 /* the following variable is used for caching string comparisons
6353 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6354 int known_errorHandler = -1;
6355
Benjamin Petersonbac79492012-01-14 13:34:47 -05006356 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006357 return NULL;
6358 size = PyUnicode_GET_LENGTH(unicode);
6359 kind = PyUnicode_KIND(unicode);
6360 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 /* allocate enough for a simple encoding without
6362 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006363 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006364 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006365 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006366 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006367 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006368 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006369 ressize = size;
6370
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006371 while (pos < size) {
6372 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006373
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 /* can we encode this? */
6375 if (c<limit) {
6376 /* no overflow check, because we know that the space is enough */
6377 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006378 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006379 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006381 Py_ssize_t requiredsize;
6382 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006385 Py_ssize_t collstart = pos;
6386 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 ++collend;
6390 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6391 if (known_errorHandler==-1) {
6392 if ((errors==NULL) || (!strcmp(errors, "strict")))
6393 known_errorHandler = 1;
6394 else if (!strcmp(errors, "replace"))
6395 known_errorHandler = 2;
6396 else if (!strcmp(errors, "ignore"))
6397 known_errorHandler = 3;
6398 else if (!strcmp(errors, "xmlcharrefreplace"))
6399 known_errorHandler = 4;
6400 else
6401 known_errorHandler = 0;
6402 }
6403 switch (known_errorHandler) {
6404 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006405 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 goto onError;
6407 case 2: /* replace */
6408 while (collstart++<collend)
6409 *str++ = '?'; /* fall through */
6410 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006411 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 break;
6413 case 4: /* xmlcharrefreplace */
6414 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006415 /* determine replacement size */
6416 for (i = collstart, repsize = 0; i < collend; ++i) {
6417 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6418 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006422 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006423 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006424 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006425 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006426 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006427 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006428 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006430 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006431 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006433 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006435 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 if (requiredsize > ressize) {
6437 if (requiredsize<2*ressize)
6438 requiredsize = 2*ressize;
6439 if (_PyBytes_Resize(&res, requiredsize))
6440 goto onError;
6441 str = PyBytes_AS_STRING(res) + respos;
6442 ressize = requiredsize;
6443 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 /* generate replacement */
6445 for (i = collstart; i < collend; ++i) {
6446 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006448 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 break;
6450 default:
6451 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006452 encoding, reason, unicode, &exc,
6453 collstart, collend, &newpos);
6454 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006455 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006457 if (PyBytes_Check(repunicode)) {
6458 /* Directly copy bytes result to output. */
6459 repsize = PyBytes_Size(repunicode);
6460 if (repsize > 1) {
6461 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006462 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006463 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6464 Py_DECREF(repunicode);
6465 goto onError;
6466 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006467 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006468 ressize += repsize-1;
6469 }
6470 memcpy(str, PyBytes_AsString(repunicode), repsize);
6471 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006473 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006474 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006475 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006476 /* need more space? (at least enough for what we
6477 have+the replacement+the rest of the string, so
6478 we won't have to check space for encodable characters) */
6479 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006480 repsize = PyUnicode_GET_LENGTH(repunicode);
6481 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 if (requiredsize > ressize) {
6483 if (requiredsize<2*ressize)
6484 requiredsize = 2*ressize;
6485 if (_PyBytes_Resize(&res, requiredsize)) {
6486 Py_DECREF(repunicode);
6487 goto onError;
6488 }
6489 str = PyBytes_AS_STRING(res) + respos;
6490 ressize = requiredsize;
6491 }
6492 /* check if there is anything unencodable in the replacement
6493 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006494 for (i = 0; repsize-->0; ++i, ++str) {
6495 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006497 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006498 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 Py_DECREF(repunicode);
6500 goto onError;
6501 }
6502 *str = (char)c;
6503 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006504 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006505 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006506 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006507 }
6508 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006509 /* Resize if we allocated too much */
6510 size = str - PyBytes_AS_STRING(res);
6511 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006512 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006513 if (_PyBytes_Resize(&res, size) < 0)
6514 goto onError;
6515 }
6516
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006517 Py_XDECREF(errorHandler);
6518 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006519 return res;
6520
6521 onError:
6522 Py_XDECREF(res);
6523 Py_XDECREF(errorHandler);
6524 Py_XDECREF(exc);
6525 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526}
6527
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006528/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006529PyObject *
6530PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006531 Py_ssize_t size,
6532 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006533{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 PyObject *result;
6535 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6536 if (unicode == NULL)
6537 return NULL;
6538 result = unicode_encode_ucs1(unicode, errors, 256);
6539 Py_DECREF(unicode);
6540 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541}
6542
Alexander Belopolsky40018472011-02-26 01:02:56 +00006543PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006544_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006545{
6546 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006547 PyErr_BadArgument();
6548 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006550 if (PyUnicode_READY(unicode) == -1)
6551 return NULL;
6552 /* Fast path: if it is a one-byte string, construct
6553 bytes object directly. */
6554 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6555 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6556 PyUnicode_GET_LENGTH(unicode));
6557 /* Non-Latin-1 characters present. Defer to above function to
6558 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006559 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006560}
6561
6562PyObject*
6563PyUnicode_AsLatin1String(PyObject *unicode)
6564{
6565 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566}
6567
6568/* --- 7-bit ASCII Codec -------------------------------------------------- */
6569
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570PyObject *
6571PyUnicode_DecodeASCII(const char *s,
6572 Py_ssize_t size,
6573 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006575 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006576 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006577 int kind;
6578 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006579 Py_ssize_t startinpos;
6580 Py_ssize_t endinpos;
6581 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006582 const char *e;
6583 PyObject *errorHandler = NULL;
6584 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006585
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006586 if (size == 0) {
6587 Py_INCREF(unicode_empty);
6588 return unicode_empty;
6589 }
6590
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006592 if (size == 1 && (unsigned char)s[0] < 128)
6593 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006594
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006595 unicode = PyUnicode_New(size, 127);
6596 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006600 data = PyUnicode_1BYTE_DATA(unicode);
6601 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6602 if (outpos == size)
6603 return unicode;
6604
6605 s += outpos;
6606 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006607 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006608 register unsigned char c = (unsigned char)*s;
6609 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006610 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 ++s;
6612 }
6613 else {
6614 startinpos = s-starts;
6615 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 if (unicode_decode_call_errorhandler(
6617 errors, &errorHandler,
6618 "ascii", "ordinal not in range(128)",
6619 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006620 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006622 kind = PyUnicode_KIND(unicode);
6623 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006626 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 Py_XDECREF(errorHandler);
6629 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006630 assert(_PyUnicode_CheckConsistency(unicode, 1));
6631 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006632
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006634 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 Py_XDECREF(errorHandler);
6636 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 return NULL;
6638}
6639
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006640/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006641PyObject *
6642PyUnicode_EncodeASCII(const Py_UNICODE *p,
6643 Py_ssize_t size,
6644 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006646 PyObject *result;
6647 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6648 if (unicode == NULL)
6649 return NULL;
6650 result = unicode_encode_ucs1(unicode, errors, 128);
6651 Py_DECREF(unicode);
6652 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653}
6654
Alexander Belopolsky40018472011-02-26 01:02:56 +00006655PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006656_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006657{
6658 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006659 PyErr_BadArgument();
6660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006662 if (PyUnicode_READY(unicode) == -1)
6663 return NULL;
6664 /* Fast path: if it is an ASCII-only string, construct bytes object
6665 directly. Else defer to above function to raise the exception. */
6666 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6667 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6668 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006669 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670}
6671
6672PyObject *
6673PyUnicode_AsASCIIString(PyObject *unicode)
6674{
6675 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676}
6677
Victor Stinner99b95382011-07-04 14:23:54 +02006678#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006679
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006680/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006681
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006682#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006683#define NEED_RETRY
6684#endif
6685
Victor Stinner3a50e702011-10-18 21:21:00 +02006686#ifndef WC_ERR_INVALID_CHARS
6687# define WC_ERR_INVALID_CHARS 0x0080
6688#endif
6689
6690static char*
6691code_page_name(UINT code_page, PyObject **obj)
6692{
6693 *obj = NULL;
6694 if (code_page == CP_ACP)
6695 return "mbcs";
6696 if (code_page == CP_UTF7)
6697 return "CP_UTF7";
6698 if (code_page == CP_UTF8)
6699 return "CP_UTF8";
6700
6701 *obj = PyBytes_FromFormat("cp%u", code_page);
6702 if (*obj == NULL)
6703 return NULL;
6704 return PyBytes_AS_STRING(*obj);
6705}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006706
Alexander Belopolsky40018472011-02-26 01:02:56 +00006707static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006708is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006709{
6710 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006711 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006712
Victor Stinner3a50e702011-10-18 21:21:00 +02006713 if (!IsDBCSLeadByteEx(code_page, *curr))
6714 return 0;
6715
6716 prev = CharPrevExA(code_page, s, curr, 0);
6717 if (prev == curr)
6718 return 1;
6719 /* FIXME: This code is limited to "true" double-byte encodings,
6720 as it assumes an incomplete character consists of a single
6721 byte. */
6722 if (curr - prev == 2)
6723 return 1;
6724 if (!IsDBCSLeadByteEx(code_page, *prev))
6725 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006726 return 0;
6727}
6728
Victor Stinner3a50e702011-10-18 21:21:00 +02006729static DWORD
6730decode_code_page_flags(UINT code_page)
6731{
6732 if (code_page == CP_UTF7) {
6733 /* The CP_UTF7 decoder only supports flags=0 */
6734 return 0;
6735 }
6736 else
6737 return MB_ERR_INVALID_CHARS;
6738}
6739
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006740/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006741 * Decode a byte string from a Windows code page into unicode object in strict
6742 * mode.
6743 *
6744 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6745 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006746 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006747static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006748decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006749 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006750 const char *in,
6751 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006752{
Victor Stinner3a50e702011-10-18 21:21:00 +02006753 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006754 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006755 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756
6757 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006758 assert(insize > 0);
6759 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6760 if (outsize <= 0)
6761 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006762
6763 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006764 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006765 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006766 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 if (*v == NULL)
6768 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006769 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770 }
6771 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006773 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006774 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006775 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006776 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777 }
6778
6779 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006780 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6781 if (outsize <= 0)
6782 goto error;
6783 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006784
Victor Stinner3a50e702011-10-18 21:21:00 +02006785error:
6786 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6787 return -2;
6788 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006789 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006790}
6791
Victor Stinner3a50e702011-10-18 21:21:00 +02006792/*
6793 * Decode a byte string from a code page into unicode object with an error
6794 * handler.
6795 *
6796 * Returns consumed size if succeed, or raise a WindowsError or
6797 * UnicodeDecodeError exception and returns -1 on error.
6798 */
6799static int
6800decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006801 PyObject **v,
6802 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006803 const char *errors)
6804{
6805 const char *startin = in;
6806 const char *endin = in + size;
6807 const DWORD flags = decode_code_page_flags(code_page);
6808 /* Ideally, we should get reason from FormatMessage. This is the Windows
6809 2000 English version of the message. */
6810 const char *reason = "No mapping for the Unicode character exists "
6811 "in the target code page.";
6812 /* each step cannot decode more than 1 character, but a character can be
6813 represented as a surrogate pair */
6814 wchar_t buffer[2], *startout, *out;
6815 int insize, outsize;
6816 PyObject *errorHandler = NULL;
6817 PyObject *exc = NULL;
6818 PyObject *encoding_obj = NULL;
6819 char *encoding;
6820 DWORD err;
6821 int ret = -1;
6822
6823 assert(size > 0);
6824
6825 encoding = code_page_name(code_page, &encoding_obj);
6826 if (encoding == NULL)
6827 return -1;
6828
6829 if (errors == NULL || strcmp(errors, "strict") == 0) {
6830 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6831 UnicodeDecodeError. */
6832 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6833 if (exc != NULL) {
6834 PyCodec_StrictErrors(exc);
6835 Py_CLEAR(exc);
6836 }
6837 goto error;
6838 }
6839
6840 if (*v == NULL) {
6841 /* Create unicode object */
6842 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6843 PyErr_NoMemory();
6844 goto error;
6845 }
Victor Stinnerab595942011-12-17 04:59:06 +01006846 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006847 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006848 if (*v == NULL)
6849 goto error;
6850 startout = PyUnicode_AS_UNICODE(*v);
6851 }
6852 else {
6853 /* Extend unicode object */
6854 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6855 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6856 PyErr_NoMemory();
6857 goto error;
6858 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006859 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 goto error;
6861 startout = PyUnicode_AS_UNICODE(*v) + n;
6862 }
6863
6864 /* Decode the byte string character per character */
6865 out = startout;
6866 while (in < endin)
6867 {
6868 /* Decode a character */
6869 insize = 1;
6870 do
6871 {
6872 outsize = MultiByteToWideChar(code_page, flags,
6873 in, insize,
6874 buffer, Py_ARRAY_LENGTH(buffer));
6875 if (outsize > 0)
6876 break;
6877 err = GetLastError();
6878 if (err != ERROR_NO_UNICODE_TRANSLATION
6879 && err != ERROR_INSUFFICIENT_BUFFER)
6880 {
6881 PyErr_SetFromWindowsErr(0);
6882 goto error;
6883 }
6884 insize++;
6885 }
6886 /* 4=maximum length of a UTF-8 sequence */
6887 while (insize <= 4 && (in + insize) <= endin);
6888
6889 if (outsize <= 0) {
6890 Py_ssize_t startinpos, endinpos, outpos;
6891
6892 startinpos = in - startin;
6893 endinpos = startinpos + 1;
6894 outpos = out - PyUnicode_AS_UNICODE(*v);
6895 if (unicode_decode_call_errorhandler(
6896 errors, &errorHandler,
6897 encoding, reason,
6898 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006899 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 {
6901 goto error;
6902 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006903 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006904 }
6905 else {
6906 in += insize;
6907 memcpy(out, buffer, outsize * sizeof(wchar_t));
6908 out += outsize;
6909 }
6910 }
6911
6912 /* write a NUL character at the end */
6913 *out = 0;
6914
6915 /* Extend unicode object */
6916 outsize = out - startout;
6917 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006918 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006921
6922error:
6923 Py_XDECREF(encoding_obj);
6924 Py_XDECREF(errorHandler);
6925 Py_XDECREF(exc);
6926 return ret;
6927}
6928
Victor Stinner3a50e702011-10-18 21:21:00 +02006929static PyObject *
6930decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006931 const char *s, Py_ssize_t size,
6932 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006933{
Victor Stinner76a31a62011-11-04 00:05:13 +01006934 PyObject *v = NULL;
6935 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006936
Victor Stinner3a50e702011-10-18 21:21:00 +02006937 if (code_page < 0) {
6938 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6939 return NULL;
6940 }
6941
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006943 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944
Victor Stinner76a31a62011-11-04 00:05:13 +01006945 do
6946 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006948 if (size > INT_MAX) {
6949 chunk_size = INT_MAX;
6950 final = 0;
6951 done = 0;
6952 }
6953 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006955 {
6956 chunk_size = (int)size;
6957 final = (consumed == NULL);
6958 done = 1;
6959 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006960
Victor Stinner76a31a62011-11-04 00:05:13 +01006961 /* Skip trailing lead-byte unless 'final' is set */
6962 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6963 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964
Victor Stinner76a31a62011-11-04 00:05:13 +01006965 if (chunk_size == 0 && done) {
6966 if (v != NULL)
6967 break;
6968 Py_INCREF(unicode_empty);
6969 return unicode_empty;
6970 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006971
Victor Stinner76a31a62011-11-04 00:05:13 +01006972
6973 converted = decode_code_page_strict(code_page, &v,
6974 s, chunk_size);
6975 if (converted == -2)
6976 converted = decode_code_page_errors(code_page, &v,
6977 s, chunk_size,
6978 errors);
6979 assert(converted != 0);
6980
6981 if (converted < 0) {
6982 Py_XDECREF(v);
6983 return NULL;
6984 }
6985
6986 if (consumed)
6987 *consumed += converted;
6988
6989 s += converted;
6990 size -= converted;
6991 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006992
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006993 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994}
6995
Alexander Belopolsky40018472011-02-26 01:02:56 +00006996PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006997PyUnicode_DecodeCodePageStateful(int code_page,
6998 const char *s,
6999 Py_ssize_t size,
7000 const char *errors,
7001 Py_ssize_t *consumed)
7002{
7003 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7004}
7005
7006PyObject *
7007PyUnicode_DecodeMBCSStateful(const char *s,
7008 Py_ssize_t size,
7009 const char *errors,
7010 Py_ssize_t *consumed)
7011{
7012 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7013}
7014
7015PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007016PyUnicode_DecodeMBCS(const char *s,
7017 Py_ssize_t size,
7018 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007019{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7021}
7022
Victor Stinner3a50e702011-10-18 21:21:00 +02007023static DWORD
7024encode_code_page_flags(UINT code_page, const char *errors)
7025{
7026 if (code_page == CP_UTF8) {
7027 if (winver.dwMajorVersion >= 6)
7028 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7029 and later */
7030 return WC_ERR_INVALID_CHARS;
7031 else
7032 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7033 return 0;
7034 }
7035 else if (code_page == CP_UTF7) {
7036 /* CP_UTF7 only supports flags=0 */
7037 return 0;
7038 }
7039 else {
7040 if (errors != NULL && strcmp(errors, "replace") == 0)
7041 return 0;
7042 else
7043 return WC_NO_BEST_FIT_CHARS;
7044 }
7045}
7046
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007047/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 * Encode a Unicode string to a Windows code page into a byte string in strict
7049 * mode.
7050 *
7051 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7052 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007054static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007055encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007056 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058{
Victor Stinner554f3f02010-06-16 23:33:54 +00007059 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007060 BOOL *pusedDefaultChar = &usedDefaultChar;
7061 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007062 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007063 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007064 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const DWORD flags = encode_code_page_flags(code_page, NULL);
7066 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007067 /* Create a substring so that we can get the UTF-16 representation
7068 of just the slice under consideration. */
7069 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007070
Martin v. Löwis3d325192011-11-04 18:23:06 +01007071 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007072
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007074 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007076 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007077
Victor Stinner2fc507f2011-11-04 20:06:39 +01007078 substring = PyUnicode_Substring(unicode, offset, offset+len);
7079 if (substring == NULL)
7080 return -1;
7081 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7082 if (p == NULL) {
7083 Py_DECREF(substring);
7084 return -1;
7085 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007086
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007087 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 outsize = WideCharToMultiByte(code_page, flags,
7089 p, size,
7090 NULL, 0,
7091 NULL, pusedDefaultChar);
7092 if (outsize <= 0)
7093 goto error;
7094 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007095 if (pusedDefaultChar && *pusedDefaultChar) {
7096 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007097 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007098 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007099
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007102 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007103 if (*outbytes == NULL) {
7104 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007106 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007107 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007108 }
7109 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 const Py_ssize_t n = PyBytes_Size(*outbytes);
7112 if (outsize > PY_SSIZE_T_MAX - n) {
7113 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007114 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007117 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7118 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007119 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007120 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122 }
7123
7124 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 outsize = WideCharToMultiByte(code_page, flags,
7126 p, size,
7127 out, outsize,
7128 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007129 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 if (outsize <= 0)
7131 goto error;
7132 if (pusedDefaultChar && *pusedDefaultChar)
7133 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007134 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007135
Victor Stinner3a50e702011-10-18 21:21:00 +02007136error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007137 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007138 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7139 return -2;
7140 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007141 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007142}
7143
Victor Stinner3a50e702011-10-18 21:21:00 +02007144/*
7145 * Encode a Unicode string to a Windows code page into a byte string using a
7146 * error handler.
7147 *
7148 * Returns consumed characters if succeed, or raise a WindowsError and returns
7149 * -1 on other error.
7150 */
7151static int
7152encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007153 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007154 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007155{
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007157 Py_ssize_t pos = unicode_offset;
7158 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 /* Ideally, we should get reason from FormatMessage. This is the Windows
7160 2000 English version of the message. */
7161 const char *reason = "invalid character";
7162 /* 4=maximum length of a UTF-8 sequence */
7163 char buffer[4];
7164 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7165 Py_ssize_t outsize;
7166 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007167 PyObject *errorHandler = NULL;
7168 PyObject *exc = NULL;
7169 PyObject *encoding_obj = NULL;
7170 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007171 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 PyObject *rep;
7173 int ret = -1;
7174
7175 assert(insize > 0);
7176
7177 encoding = code_page_name(code_page, &encoding_obj);
7178 if (encoding == NULL)
7179 return -1;
7180
7181 if (errors == NULL || strcmp(errors, "strict") == 0) {
7182 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7183 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007184 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007185 if (exc != NULL) {
7186 PyCodec_StrictErrors(exc);
7187 Py_DECREF(exc);
7188 }
7189 Py_XDECREF(encoding_obj);
7190 return -1;
7191 }
7192
7193 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7194 pusedDefaultChar = &usedDefaultChar;
7195 else
7196 pusedDefaultChar = NULL;
7197
7198 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7199 PyErr_NoMemory();
7200 goto error;
7201 }
7202 outsize = insize * Py_ARRAY_LENGTH(buffer);
7203
7204 if (*outbytes == NULL) {
7205 /* Create string object */
7206 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7207 if (*outbytes == NULL)
7208 goto error;
7209 out = PyBytes_AS_STRING(*outbytes);
7210 }
7211 else {
7212 /* Extend string object */
7213 Py_ssize_t n = PyBytes_Size(*outbytes);
7214 if (n > PY_SSIZE_T_MAX - outsize) {
7215 PyErr_NoMemory();
7216 goto error;
7217 }
7218 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7219 goto error;
7220 out = PyBytes_AS_STRING(*outbytes) + n;
7221 }
7222
7223 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007224 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007226 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7227 wchar_t chars[2];
7228 int charsize;
7229 if (ch < 0x10000) {
7230 chars[0] = (wchar_t)ch;
7231 charsize = 1;
7232 }
7233 else {
7234 ch -= 0x10000;
7235 chars[0] = 0xd800 + (ch >> 10);
7236 chars[1] = 0xdc00 + (ch & 0x3ff);
7237 charsize = 2;
7238 }
7239
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007241 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 buffer, Py_ARRAY_LENGTH(buffer),
7243 NULL, pusedDefaultChar);
7244 if (outsize > 0) {
7245 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7246 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007247 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 memcpy(out, buffer, outsize);
7249 out += outsize;
7250 continue;
7251 }
7252 }
7253 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7254 PyErr_SetFromWindowsErr(0);
7255 goto error;
7256 }
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 rep = unicode_encode_call_errorhandler(
7259 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007260 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007261 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 if (rep == NULL)
7263 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007264 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007265
7266 if (PyBytes_Check(rep)) {
7267 outsize = PyBytes_GET_SIZE(rep);
7268 if (outsize != 1) {
7269 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7270 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7271 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7272 Py_DECREF(rep);
7273 goto error;
7274 }
7275 out = PyBytes_AS_STRING(*outbytes) + offset;
7276 }
7277 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7278 out += outsize;
7279 }
7280 else {
7281 Py_ssize_t i;
7282 enum PyUnicode_Kind kind;
7283 void *data;
7284
Benjamin Petersonbac79492012-01-14 13:34:47 -05007285 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 Py_DECREF(rep);
7287 goto error;
7288 }
7289
7290 outsize = PyUnicode_GET_LENGTH(rep);
7291 if (outsize != 1) {
7292 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7293 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7294 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7295 Py_DECREF(rep);
7296 goto error;
7297 }
7298 out = PyBytes_AS_STRING(*outbytes) + offset;
7299 }
7300 kind = PyUnicode_KIND(rep);
7301 data = PyUnicode_DATA(rep);
7302 for (i=0; i < outsize; i++) {
7303 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7304 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007305 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 encoding, unicode,
7307 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 "unable to encode error handler result to ASCII");
7309 Py_DECREF(rep);
7310 goto error;
7311 }
7312 *out = (unsigned char)ch;
7313 out++;
7314 }
7315 }
7316 Py_DECREF(rep);
7317 }
7318 /* write a NUL byte */
7319 *out = 0;
7320 outsize = out - PyBytes_AS_STRING(*outbytes);
7321 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7322 if (_PyBytes_Resize(outbytes, outsize) < 0)
7323 goto error;
7324 ret = 0;
7325
7326error:
7327 Py_XDECREF(encoding_obj);
7328 Py_XDECREF(errorHandler);
7329 Py_XDECREF(exc);
7330 return ret;
7331}
7332
Victor Stinner3a50e702011-10-18 21:21:00 +02007333static PyObject *
7334encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007335 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007336 const char *errors)
7337{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007340 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007342
Benjamin Petersonbac79492012-01-14 13:34:47 -05007343 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007344 return NULL;
7345 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007346
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 if (code_page < 0) {
7348 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7349 return NULL;
7350 }
7351
Martin v. Löwis3d325192011-11-04 18:23:06 +01007352 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007353 return PyBytes_FromStringAndSize(NULL, 0);
7354
Victor Stinner7581cef2011-11-03 22:32:33 +01007355 offset = 0;
7356 do
7357 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007359 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007360 chunks. */
7361 if (len > INT_MAX/2) {
7362 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007363 done = 0;
7364 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007365 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007367 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007368 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007369 done = 1;
7370 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007371
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007373 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007374 errors);
7375 if (ret == -2)
7376 ret = encode_code_page_errors(code_page, &outbytes,
7377 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007378 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007379 if (ret < 0) {
7380 Py_XDECREF(outbytes);
7381 return NULL;
7382 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007383
Victor Stinner7581cef2011-11-03 22:32:33 +01007384 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007385 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007386 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 return outbytes;
7389}
7390
7391PyObject *
7392PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7393 Py_ssize_t size,
7394 const char *errors)
7395{
Victor Stinner7581cef2011-11-03 22:32:33 +01007396 PyObject *unicode, *res;
7397 unicode = PyUnicode_FromUnicode(p, size);
7398 if (unicode == NULL)
7399 return NULL;
7400 res = encode_code_page(CP_ACP, unicode, errors);
7401 Py_DECREF(unicode);
7402 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007403}
7404
7405PyObject *
7406PyUnicode_EncodeCodePage(int code_page,
7407 PyObject *unicode,
7408 const char *errors)
7409{
Victor Stinner7581cef2011-11-03 22:32:33 +01007410 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007411}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007412
Alexander Belopolsky40018472011-02-26 01:02:56 +00007413PyObject *
7414PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007415{
7416 if (!PyUnicode_Check(unicode)) {
7417 PyErr_BadArgument();
7418 return NULL;
7419 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007420 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007421}
7422
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423#undef NEED_RETRY
7424
Victor Stinner99b95382011-07-04 14:23:54 +02007425#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007426
Guido van Rossumd57fd912000-03-10 22:53:23 +00007427/* --- Character Mapping Codec -------------------------------------------- */
7428
Alexander Belopolsky40018472011-02-26 01:02:56 +00007429PyObject *
7430PyUnicode_DecodeCharmap(const char *s,
7431 Py_ssize_t size,
7432 PyObject *mapping,
7433 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007435 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007436 Py_ssize_t startinpos;
7437 Py_ssize_t endinpos;
7438 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007439 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007440 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007441 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007442 PyObject *errorHandler = NULL;
7443 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007444
Guido van Rossumd57fd912000-03-10 22:53:23 +00007445 /* Default to Latin-1 */
7446 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007447 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007448
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007449 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007450 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007452 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007453 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007454 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007455 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007456 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007457 Py_ssize_t maplen;
7458 enum PyUnicode_Kind kind;
7459 void *data;
7460 Py_UCS4 x;
7461
Benjamin Petersonbac79492012-01-14 13:34:47 -05007462 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007463 return NULL;
7464
7465 maplen = PyUnicode_GET_LENGTH(mapping);
7466 data = PyUnicode_DATA(mapping);
7467 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 while (s < e) {
7469 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007472 x = PyUnicode_READ(kind, data, ch);
7473 else
7474 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007476 if (x == 0xfffe)
7477 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007479 startinpos = s-starts;
7480 endinpos = startinpos+1;
7481 if (unicode_decode_call_errorhandler(
7482 errors, &errorHandler,
7483 "charmap", "character maps to <undefined>",
7484 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007485 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007486 goto onError;
7487 }
7488 continue;
7489 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007490
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007491 if (unicode_putchar(&v, &outpos, x) < 0)
7492 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007493 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007494 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007495 }
7496 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 while (s < e) {
7498 unsigned char ch = *s;
7499 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007500
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7502 w = PyLong_FromLong((long)ch);
7503 if (w == NULL)
7504 goto onError;
7505 x = PyObject_GetItem(mapping, w);
7506 Py_DECREF(w);
7507 if (x == NULL) {
7508 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7509 /* No mapping found means: mapping is undefined. */
7510 PyErr_Clear();
7511 x = Py_None;
7512 Py_INCREF(x);
7513 } else
7514 goto onError;
7515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007516
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 /* Apply mapping */
7518 if (PyLong_Check(x)) {
7519 long value = PyLong_AS_LONG(x);
7520 if (value < 0 || value > 65535) {
7521 PyErr_SetString(PyExc_TypeError,
7522 "character mapping must be in range(65536)");
7523 Py_DECREF(x);
7524 goto onError;
7525 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007526 if (unicode_putchar(&v, &outpos, value) < 0)
7527 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 }
7529 else if (x == Py_None) {
7530 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 startinpos = s-starts;
7532 endinpos = startinpos+1;
7533 if (unicode_decode_call_errorhandler(
7534 errors, &errorHandler,
7535 "charmap", "character maps to <undefined>",
7536 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007537 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 Py_DECREF(x);
7539 goto onError;
7540 }
7541 Py_DECREF(x);
7542 continue;
7543 }
7544 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007545 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007546
Benjamin Petersonbac79492012-01-14 13:34:47 -05007547 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007548 goto onError;
7549 targetsize = PyUnicode_GET_LENGTH(x);
7550
7551 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007552 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007553 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007554 PyUnicode_READ_CHAR(x, 0)) < 0)
7555 goto onError;
7556 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007557 else if (targetsize > 1) {
7558 /* 1-n mapping */
7559 if (targetsize > extrachars) {
7560 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007561 Py_ssize_t needed = (targetsize - extrachars) + \
7562 (targetsize << 2);
7563 extrachars += needed;
7564 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007565 if (unicode_resize(&v,
7566 PyUnicode_GET_LENGTH(v) + needed) < 0)
7567 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 Py_DECREF(x);
7569 goto onError;
7570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007571 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007572 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007573 goto onError;
7574 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7575 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 extrachars -= targetsize;
7577 }
7578 /* 1-0 mapping: skip the character */
7579 }
7580 else {
7581 /* wrong return value */
7582 PyErr_SetString(PyExc_TypeError,
7583 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007584 Py_DECREF(x);
7585 goto onError;
7586 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007587 Py_DECREF(x);
7588 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007589 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007590 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007591 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007592 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007593 Py_XDECREF(errorHandler);
7594 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007595 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007596
Benjamin Peterson29060642009-01-31 22:14:21 +00007597 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007598 Py_XDECREF(errorHandler);
7599 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 Py_XDECREF(v);
7601 return NULL;
7602}
7603
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007604/* Charmap encoding: the lookup table */
7605
Alexander Belopolsky40018472011-02-26 01:02:56 +00007606struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 PyObject_HEAD
7608 unsigned char level1[32];
7609 int count2, count3;
7610 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007611};
7612
7613static PyObject*
7614encoding_map_size(PyObject *obj, PyObject* args)
7615{
7616 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007617 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619}
7620
7621static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007622 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007623 PyDoc_STR("Return the size (in bytes) of this object") },
7624 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625};
7626
7627static void
7628encoding_map_dealloc(PyObject* o)
7629{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007631}
7632
7633static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007634 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007635 "EncodingMap", /*tp_name*/
7636 sizeof(struct encoding_map), /*tp_basicsize*/
7637 0, /*tp_itemsize*/
7638 /* methods */
7639 encoding_map_dealloc, /*tp_dealloc*/
7640 0, /*tp_print*/
7641 0, /*tp_getattr*/
7642 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007643 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007644 0, /*tp_repr*/
7645 0, /*tp_as_number*/
7646 0, /*tp_as_sequence*/
7647 0, /*tp_as_mapping*/
7648 0, /*tp_hash*/
7649 0, /*tp_call*/
7650 0, /*tp_str*/
7651 0, /*tp_getattro*/
7652 0, /*tp_setattro*/
7653 0, /*tp_as_buffer*/
7654 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7655 0, /*tp_doc*/
7656 0, /*tp_traverse*/
7657 0, /*tp_clear*/
7658 0, /*tp_richcompare*/
7659 0, /*tp_weaklistoffset*/
7660 0, /*tp_iter*/
7661 0, /*tp_iternext*/
7662 encoding_map_methods, /*tp_methods*/
7663 0, /*tp_members*/
7664 0, /*tp_getset*/
7665 0, /*tp_base*/
7666 0, /*tp_dict*/
7667 0, /*tp_descr_get*/
7668 0, /*tp_descr_set*/
7669 0, /*tp_dictoffset*/
7670 0, /*tp_init*/
7671 0, /*tp_alloc*/
7672 0, /*tp_new*/
7673 0, /*tp_free*/
7674 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007675};
7676
7677PyObject*
7678PyUnicode_BuildEncodingMap(PyObject* string)
7679{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007680 PyObject *result;
7681 struct encoding_map *mresult;
7682 int i;
7683 int need_dict = 0;
7684 unsigned char level1[32];
7685 unsigned char level2[512];
7686 unsigned char *mlevel1, *mlevel2, *mlevel3;
7687 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007688 int kind;
7689 void *data;
7690 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007692 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007693 PyErr_BadArgument();
7694 return NULL;
7695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007696 kind = PyUnicode_KIND(string);
7697 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007698 memset(level1, 0xFF, sizeof level1);
7699 memset(level2, 0xFF, sizeof level2);
7700
7701 /* If there isn't a one-to-one mapping of NULL to \0,
7702 or if there are non-BMP characters, we need to use
7703 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007704 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007705 need_dict = 1;
7706 for (i = 1; i < 256; i++) {
7707 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007708 ch = PyUnicode_READ(kind, data, i);
7709 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007710 need_dict = 1;
7711 break;
7712 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007713 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007714 /* unmapped character */
7715 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007716 l1 = ch >> 11;
7717 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007718 if (level1[l1] == 0xFF)
7719 level1[l1] = count2++;
7720 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007721 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007722 }
7723
7724 if (count2 >= 0xFF || count3 >= 0xFF)
7725 need_dict = 1;
7726
7727 if (need_dict) {
7728 PyObject *result = PyDict_New();
7729 PyObject *key, *value;
7730 if (!result)
7731 return NULL;
7732 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007733 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007734 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007735 if (!key || !value)
7736 goto failed1;
7737 if (PyDict_SetItem(result, key, value) == -1)
7738 goto failed1;
7739 Py_DECREF(key);
7740 Py_DECREF(value);
7741 }
7742 return result;
7743 failed1:
7744 Py_XDECREF(key);
7745 Py_XDECREF(value);
7746 Py_DECREF(result);
7747 return NULL;
7748 }
7749
7750 /* Create a three-level trie */
7751 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7752 16*count2 + 128*count3 - 1);
7753 if (!result)
7754 return PyErr_NoMemory();
7755 PyObject_Init(result, &EncodingMapType);
7756 mresult = (struct encoding_map*)result;
7757 mresult->count2 = count2;
7758 mresult->count3 = count3;
7759 mlevel1 = mresult->level1;
7760 mlevel2 = mresult->level23;
7761 mlevel3 = mresult->level23 + 16*count2;
7762 memcpy(mlevel1, level1, 32);
7763 memset(mlevel2, 0xFF, 16*count2);
7764 memset(mlevel3, 0, 128*count3);
7765 count3 = 0;
7766 for (i = 1; i < 256; i++) {
7767 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007768 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 /* unmapped character */
7770 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007771 o1 = PyUnicode_READ(kind, data, i)>>11;
7772 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 i2 = 16*mlevel1[o1] + o2;
7774 if (mlevel2[i2] == 0xFF)
7775 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007776 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 i3 = 128*mlevel2[i2] + o3;
7778 mlevel3[i3] = i;
7779 }
7780 return result;
7781}
7782
7783static int
Victor Stinner22168992011-11-20 17:09:18 +01007784encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785{
7786 struct encoding_map *map = (struct encoding_map*)mapping;
7787 int l1 = c>>11;
7788 int l2 = (c>>7) & 0xF;
7789 int l3 = c & 0x7F;
7790 int i;
7791
Victor Stinner22168992011-11-20 17:09:18 +01007792 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007794 if (c == 0)
7795 return 0;
7796 /* level 1*/
7797 i = map->level1[l1];
7798 if (i == 0xFF) {
7799 return -1;
7800 }
7801 /* level 2*/
7802 i = map->level23[16*i+l2];
7803 if (i == 0xFF) {
7804 return -1;
7805 }
7806 /* level 3 */
7807 i = map->level23[16*map->count2 + 128*i + l3];
7808 if (i == 0) {
7809 return -1;
7810 }
7811 return i;
7812}
7813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814/* Lookup the character ch in the mapping. If the character
7815 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007816 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007818charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Christian Heimes217cfd12007-12-02 14:31:20 +00007820 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 PyObject *x;
7822
7823 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 x = PyObject_GetItem(mapping, w);
7826 Py_DECREF(w);
7827 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7829 /* No mapping found means: mapping is undefined. */
7830 PyErr_Clear();
7831 x = Py_None;
7832 Py_INCREF(x);
7833 return x;
7834 } else
7835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007837 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 long value = PyLong_AS_LONG(x);
7841 if (value < 0 || value > 255) {
7842 PyErr_SetString(PyExc_TypeError,
7843 "character mapping must be in range(256)");
7844 Py_DECREF(x);
7845 return NULL;
7846 }
7847 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007849 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* wrong return value */
7853 PyErr_Format(PyExc_TypeError,
7854 "character mapping must return integer, bytes or None, not %.400s",
7855 x->ob_type->tp_name);
7856 Py_DECREF(x);
7857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
7859}
7860
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007862charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7865 /* exponentially overallocate to minimize reallocations */
7866 if (requiredsize < 2*outsize)
7867 requiredsize = 2*outsize;
7868 if (_PyBytes_Resize(outobj, requiredsize))
7869 return -1;
7870 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871}
7872
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007877 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007878 space is available. Return a new reference to the object that
7879 was put in the output buffer, or Py_None, if the mapping was undefined
7880 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007881 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007883charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886 PyObject *rep;
7887 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889
Christian Heimes90aa7642007-12-19 02:45:37 +00007890 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (res == -1)
7894 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 if (outsize<requiredsize)
7896 if (charmapencode_resize(outobj, outpos, requiredsize))
7897 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007898 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 outstart[(*outpos)++] = (char)res;
7900 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 }
7902
7903 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_DECREF(rep);
7908 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 if (PyLong_Check(rep)) {
7911 Py_ssize_t requiredsize = *outpos+1;
7912 if (outsize<requiredsize)
7913 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7914 Py_DECREF(rep);
7915 return enc_EXCEPTION;
7916 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007917 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 else {
7921 const char *repchars = PyBytes_AS_STRING(rep);
7922 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7923 Py_ssize_t requiredsize = *outpos+repsize;
7924 if (outsize<requiredsize)
7925 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7926 Py_DECREF(rep);
7927 return enc_EXCEPTION;
7928 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007929 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 memcpy(outstart + *outpos, repchars, repsize);
7931 *outpos += repsize;
7932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 Py_DECREF(rep);
7935 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936}
7937
7938/* handle an error in PyUnicode_EncodeCharmap
7939 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007940static int
7941charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007942 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007944 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007945 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946{
7947 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007948 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007950 enum PyUnicode_Kind kind;
7951 void *data;
7952 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t collstartpos = *inpos;
7955 Py_ssize_t collendpos = *inpos+1;
7956 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 char *encoding = "charmap";
7958 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return -1;
7965 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 /* find all unencodable characters */
7967 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007969 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007971 val = encoding_map_lookup(ch, mapping);
7972 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 break;
7974 ++collendpos;
7975 continue;
7976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7979 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (rep==NULL)
7981 return -1;
7982 else if (rep!=Py_None) {
7983 Py_DECREF(rep);
7984 break;
7985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 }
7989 /* cache callback name lookup
7990 * (if not done yet, i.e. it's the first error) */
7991 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 if ((errors==NULL) || (!strcmp(errors, "strict")))
7993 *known_errorHandler = 1;
7994 else if (!strcmp(errors, "replace"))
7995 *known_errorHandler = 2;
7996 else if (!strcmp(errors, "ignore"))
7997 *known_errorHandler = 3;
7998 else if (!strcmp(errors, "xmlcharrefreplace"))
7999 *known_errorHandler = 4;
8000 else
8001 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002 }
8003 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008005 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 return -1;
8007 case 2: /* replace */
8008 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 x = charmapencode_output('?', mapping, res, respos);
8010 if (x==enc_EXCEPTION) {
8011 return -1;
8012 }
8013 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008014 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return -1;
8016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 }
8018 /* fall through */
8019 case 3: /* ignore */
8020 *inpos = collendpos;
8021 break;
8022 case 4: /* xmlcharrefreplace */
8023 /* generate replacement (temporarily (mis)uses p) */
8024 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 char buffer[2+29+1+1];
8026 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008027 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 for (cp = buffer; *cp; ++cp) {
8029 x = charmapencode_output(*cp, mapping, res, respos);
8030 if (x==enc_EXCEPTION)
8031 return -1;
8032 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008033 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 return -1;
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 }
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 *inpos = collendpos;
8039 break;
8040 default:
8041 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008042 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008046 if (PyBytes_Check(repunicode)) {
8047 /* Directly copy bytes result to output. */
8048 Py_ssize_t outsize = PyBytes_Size(*res);
8049 Py_ssize_t requiredsize;
8050 repsize = PyBytes_Size(repunicode);
8051 requiredsize = *respos + repsize;
8052 if (requiredsize > outsize)
8053 /* Make room for all additional bytes. */
8054 if (charmapencode_resize(res, respos, requiredsize)) {
8055 Py_DECREF(repunicode);
8056 return -1;
8057 }
8058 memcpy(PyBytes_AsString(*res) + *respos,
8059 PyBytes_AsString(repunicode), repsize);
8060 *respos += repsize;
8061 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008066 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
8068 return -1;
8069 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008070 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008071 data = PyUnicode_DATA(repunicode);
8072 kind = PyUnicode_KIND(repunicode);
8073 for (index = 0; index < repsize; index++) {
8074 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8075 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 return -1;
8079 }
8080 else if (x==enc_FAILED) {
8081 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008082 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
8086 *inpos = newpos;
8087 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089 return 0;
8090}
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093_PyUnicode_EncodeCharmap(PyObject *unicode,
8094 PyObject *mapping,
8095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* output object */
8098 PyObject *res = NULL;
8099 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 PyObject *errorHandler = NULL;
8105 PyObject *exc = NULL;
8106 /* the following variable is used for caching string comparisons
8107 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8108 * 3=ignore, 4=xmlcharrefreplace */
8109 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 return NULL;
8113 size = PyUnicode_GET_LENGTH(unicode);
8114
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 /* Default to Latin-1 */
8116 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 /* allocate enough for a simple encoding without
8120 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 if (res == NULL)
8123 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008124 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (x==enc_EXCEPTION) /* error */
8132 goto onError;
8133 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 &exc,
8136 &known_errorHandler, &errorHandler, errors,
8137 &res, &respos)) {
8138 goto onError;
8139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 else
8142 /* done with this character => adjust input position */
8143 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008147 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008148 if (_PyBytes_Resize(&res, respos) < 0)
8149 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 Py_XDECREF(exc);
8152 Py_XDECREF(errorHandler);
8153 return res;
8154
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 Py_XDECREF(res);
8157 Py_XDECREF(exc);
8158 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 return NULL;
8160}
8161
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162/* Deprecated */
8163PyObject *
8164PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8165 Py_ssize_t size,
8166 PyObject *mapping,
8167 const char *errors)
8168{
8169 PyObject *result;
8170 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8171 if (unicode == NULL)
8172 return NULL;
8173 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8174 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008175 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008176}
8177
Alexander Belopolsky40018472011-02-26 01:02:56 +00008178PyObject *
8179PyUnicode_AsCharmapString(PyObject *unicode,
8180 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181{
8182 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 PyErr_BadArgument();
8184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008186 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187}
8188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008190static void
8191make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193 Py_ssize_t startpos, Py_ssize_t endpos,
8194 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 *exceptionObject = _PyUnicodeTranslateError_Create(
8198 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 }
8200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8202 goto onError;
8203 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8206 goto onError;
8207 return;
8208 onError:
8209 Py_DECREF(*exceptionObject);
8210 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 }
8212}
8213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008215static void
8216raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218 Py_ssize_t startpos, Py_ssize_t endpos,
8219 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220{
8221 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225}
8226
8227/* error handling callback helper:
8228 build arguments, call the callback and check the arguments,
8229 put the result into newpos and return the replacement string, which
8230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231static PyObject *
8232unicode_translate_call_errorhandler(const char *errors,
8233 PyObject **errorHandler,
8234 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236 Py_ssize_t startpos, Py_ssize_t endpos,
8237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008239 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008241 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 PyObject *restuple;
8243 PyObject *resunicode;
8244
8245 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 }
8250
8251 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255
8256 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008261 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 Py_DECREF(restuple);
8263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 &resunicode, &i_newpos)) {
8267 Py_DECREF(restuple);
8268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008270 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 else
8273 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8276 Py_DECREF(restuple);
8277 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 Py_INCREF(resunicode);
8280 Py_DECREF(restuple);
8281 return resunicode;
8282}
8283
8284/* Lookup the character ch in the mapping and put the result in result,
8285 which must be decrefed by the caller.
8286 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008287static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289{
Christian Heimes217cfd12007-12-02 14:31:20 +00008290 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 PyObject *x;
8292
8293 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 x = PyObject_GetItem(mapping, w);
8296 Py_DECREF(w);
8297 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8299 /* No mapping found means: use 1:1 mapping. */
8300 PyErr_Clear();
8301 *result = NULL;
8302 return 0;
8303 } else
8304 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 }
8306 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 *result = x;
8308 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 long value = PyLong_AS_LONG(x);
8312 long max = PyUnicode_GetMax();
8313 if (value < 0 || value > max) {
8314 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008315 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_DECREF(x);
8317 return -1;
8318 }
8319 *result = x;
8320 return 0;
8321 }
8322 else if (PyUnicode_Check(x)) {
8323 *result = x;
8324 return 0;
8325 }
8326 else {
8327 /* wrong return value */
8328 PyErr_SetString(PyExc_TypeError,
8329 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330 Py_DECREF(x);
8331 return -1;
8332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333}
8334/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 if not reallocate and adjust various state variables.
8336 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008342 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008343 /* exponentially overallocate to minimize reallocations */
8344 if (requiredsize < 2 * oldsize)
8345 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008346 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8347 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008349 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 }
8351 return 0;
8352}
8353/* lookup the character, put the result in the output string and adjust
8354 various state variables. Return a new reference to the object that
8355 was put in the output buffer in *result, or Py_None, if the mapping was
8356 undefined (in which case no character was written).
8357 The called must decref result.
8358 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008360charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8361 PyObject *mapping, Py_UCS4 **output,
8362 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8366 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008370 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371 }
8372 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008374 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 }
8378 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 Py_ssize_t repsize;
8380 if (PyUnicode_READY(*res) == -1)
8381 return -1;
8382 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 if (repsize==1) {
8384 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008385 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 }
8387 else if (repsize!=0) {
8388 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 Py_ssize_t requiredsize = *opos +
8390 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008392 Py_ssize_t i;
8393 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 for(i = 0; i < repsize; i++)
8396 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398 }
8399 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401 return 0;
8402}
8403
Alexander Belopolsky40018472011-02-26 01:02:56 +00008404PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405_PyUnicode_TranslateCharmap(PyObject *input,
8406 PyObject *mapping,
8407 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 /* input object */
8410 char *idata;
8411 Py_ssize_t size, i;
8412 int kind;
8413 /* output buffer */
8414 Py_UCS4 *output = NULL;
8415 Py_ssize_t osize;
8416 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 char *reason = "character maps to <undefined>";
8420 PyObject *errorHandler = NULL;
8421 PyObject *exc = NULL;
8422 /* the following variable is used for caching string comparisons
8423 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8424 * 3=ignore, 4=xmlcharrefreplace */
8425 int known_errorHandler = -1;
8426
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 PyErr_BadArgument();
8429 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008430 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 if (PyUnicode_READY(input) == -1)
8433 return NULL;
8434 idata = (char*)PyUnicode_DATA(input);
8435 kind = PyUnicode_KIND(input);
8436 size = PyUnicode_GET_LENGTH(input);
8437 i = 0;
8438
8439 if (size == 0) {
8440 Py_INCREF(input);
8441 return input;
8442 }
8443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 /* allocate enough for a simple 1:1 translation without
8445 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008446 osize = size;
8447 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8448 opos = 0;
8449 if (output == NULL) {
8450 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 /* try to encode it */
8456 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 if (charmaptranslate_output(input, i, mapping,
8458 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 Py_XDECREF(x);
8460 goto onError;
8461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 else { /* untranslatable character */
8466 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8467 Py_ssize_t repsize;
8468 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008470 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 Py_ssize_t collstart = i;
8472 Py_ssize_t collend = i+1;
8473 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008474
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 while (collend < size) {
8477 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 goto onError;
8479 Py_XDECREF(x);
8480 if (x!=Py_None)
8481 break;
8482 ++collend;
8483 }
8484 /* cache callback name lookup
8485 * (if not done yet, i.e. it's the first error) */
8486 if (known_errorHandler==-1) {
8487 if ((errors==NULL) || (!strcmp(errors, "strict")))
8488 known_errorHandler = 1;
8489 else if (!strcmp(errors, "replace"))
8490 known_errorHandler = 2;
8491 else if (!strcmp(errors, "ignore"))
8492 known_errorHandler = 3;
8493 else if (!strcmp(errors, "xmlcharrefreplace"))
8494 known_errorHandler = 4;
8495 else
8496 known_errorHandler = 0;
8497 }
8498 switch (known_errorHandler) {
8499 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008500 raise_translate_exception(&exc, input, collstart,
8501 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008502 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 case 2: /* replace */
8504 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 for (coll = collstart; coll<collend; coll++)
8506 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 /* fall through */
8508 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 break;
8511 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 /* generate replacement (temporarily (mis)uses i) */
8513 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 char buffer[2+29+1+1];
8515 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8517 if (charmaptranslate_makespace(&output, &osize,
8518 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 goto onError;
8520 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 break;
8525 default:
8526 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 reason, input, &exc,
8528 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008529 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008531 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008532 Py_DECREF(repunicode);
8533 goto onError;
8534 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 repsize = PyUnicode_GET_LENGTH(repunicode);
8537 if (charmaptranslate_makespace(&output, &osize,
8538 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 Py_DECREF(repunicode);
8540 goto onError;
8541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008542 for (uni2 = 0; repsize-->0; ++uni2)
8543 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8544 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008546 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008547 }
8548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8550 if (!res)
8551 goto onError;
8552 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 Py_XDECREF(exc);
8554 Py_XDECREF(errorHandler);
8555 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008556
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008558 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 Py_XDECREF(exc);
8560 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561 return NULL;
8562}
8563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008564/* Deprecated. Use PyUnicode_Translate instead. */
8565PyObject *
8566PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8567 Py_ssize_t size,
8568 PyObject *mapping,
8569 const char *errors)
8570{
8571 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8572 if (!unicode)
8573 return NULL;
8574 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8575}
8576
Alexander Belopolsky40018472011-02-26 01:02:56 +00008577PyObject *
8578PyUnicode_Translate(PyObject *str,
8579 PyObject *mapping,
8580 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581{
8582 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008583
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584 str = PyUnicode_FromObject(str);
8585 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588 Py_DECREF(str);
8589 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 Py_XDECREF(str);
8593 return NULL;
8594}
Tim Petersced69f82003-09-16 20:30:58 +00008595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008597fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598{
8599 /* No need to call PyUnicode_READY(self) because this function is only
8600 called as a callback from fixup() which does it already. */
8601 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8602 const int kind = PyUnicode_KIND(self);
8603 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008604 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008605 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 Py_ssize_t i;
8607
8608 for (i = 0; i < len; ++i) {
8609 ch = PyUnicode_READ(kind, data, i);
8610 fixed = 0;
8611 if (ch > 127) {
8612 if (Py_UNICODE_ISSPACE(ch))
8613 fixed = ' ';
8614 else {
8615 const int decimal = Py_UNICODE_TODECIMAL(ch);
8616 if (decimal >= 0)
8617 fixed = '0' + decimal;
8618 }
8619 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008620 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008621 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622 PyUnicode_WRITE(kind, data, i, fixed);
8623 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008624 else
8625 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 }
8628
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008629 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630}
8631
8632PyObject *
8633_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8634{
8635 if (!PyUnicode_Check(unicode)) {
8636 PyErr_BadInternalCall();
8637 return NULL;
8638 }
8639 if (PyUnicode_READY(unicode) == -1)
8640 return NULL;
8641 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8642 /* If the string is already ASCII, just return the same string */
8643 Py_INCREF(unicode);
8644 return unicode;
8645 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008646 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647}
8648
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008649PyObject *
8650PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8651 Py_ssize_t length)
8652{
Victor Stinnerf0124502011-11-21 23:12:56 +01008653 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008654 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008655 Py_UCS4 maxchar;
8656 enum PyUnicode_Kind kind;
8657 void *data;
8658
Victor Stinner99d7ad02012-02-22 13:37:39 +01008659 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008660 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008661 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008662 if (ch > 127) {
8663 int decimal = Py_UNICODE_TODECIMAL(ch);
8664 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008665 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008666 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008667 }
8668 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008669
8670 /* Copy to a new string */
8671 decimal = PyUnicode_New(length, maxchar);
8672 if (decimal == NULL)
8673 return decimal;
8674 kind = PyUnicode_KIND(decimal);
8675 data = PyUnicode_DATA(decimal);
8676 /* Iterate over code points */
8677 for (i = 0; i < length; i++) {
8678 Py_UNICODE ch = s[i];
8679 if (ch > 127) {
8680 int decimal = Py_UNICODE_TODECIMAL(ch);
8681 if (decimal >= 0)
8682 ch = '0' + decimal;
8683 }
8684 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008686 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008687}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008688/* --- Decimal Encoder ---------------------------------------------------- */
8689
Alexander Belopolsky40018472011-02-26 01:02:56 +00008690int
8691PyUnicode_EncodeDecimal(Py_UNICODE *s,
8692 Py_ssize_t length,
8693 char *output,
8694 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008695{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008696 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008697 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008698 enum PyUnicode_Kind kind;
8699 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008700
8701 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 PyErr_BadArgument();
8703 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008704 }
8705
Victor Stinner42bf7752011-11-21 22:52:58 +01008706 unicode = PyUnicode_FromUnicode(s, length);
8707 if (unicode == NULL)
8708 return -1;
8709
Benjamin Petersonbac79492012-01-14 13:34:47 -05008710 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008711 Py_DECREF(unicode);
8712 return -1;
8713 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008714 kind = PyUnicode_KIND(unicode);
8715 data = PyUnicode_DATA(unicode);
8716
Victor Stinnerb84d7232011-11-22 01:50:07 +01008717 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008718 PyObject *exc;
8719 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008721 Py_ssize_t startpos;
8722
8723 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008724
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008726 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008727 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008729 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 decimal = Py_UNICODE_TODECIMAL(ch);
8731 if (decimal >= 0) {
8732 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008733 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 continue;
8735 }
8736 if (0 < ch && ch < 256) {
8737 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008738 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 continue;
8740 }
Victor Stinner6345be92011-11-25 20:09:01 +01008741
Victor Stinner42bf7752011-11-21 22:52:58 +01008742 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008743 exc = NULL;
8744 raise_encode_exception(&exc, "decimal", unicode,
8745 startpos, startpos+1,
8746 "invalid decimal Unicode string");
8747 Py_XDECREF(exc);
8748 Py_DECREF(unicode);
8749 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008750 }
8751 /* 0-terminate the output string */
8752 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008753 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008754 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008755}
8756
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757/* --- Helpers ------------------------------------------------------------ */
8758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008760any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008761 Py_ssize_t start,
8762 Py_ssize_t end)
8763{
8764 int kind1, kind2, kind;
8765 void *buf1, *buf2;
8766 Py_ssize_t len1, len2, result;
8767
8768 kind1 = PyUnicode_KIND(s1);
8769 kind2 = PyUnicode_KIND(s2);
8770 kind = kind1 > kind2 ? kind1 : kind2;
8771 buf1 = PyUnicode_DATA(s1);
8772 buf2 = PyUnicode_DATA(s2);
8773 if (kind1 != kind)
8774 buf1 = _PyUnicode_AsKind(s1, kind);
8775 if (!buf1)
8776 return -2;
8777 if (kind2 != kind)
8778 buf2 = _PyUnicode_AsKind(s2, kind);
8779 if (!buf2) {
8780 if (kind1 != kind) PyMem_Free(buf1);
8781 return -2;
8782 }
8783 len1 = PyUnicode_GET_LENGTH(s1);
8784 len2 = PyUnicode_GET_LENGTH(s2);
8785
Victor Stinner794d5672011-10-10 03:21:36 +02008786 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008787 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008788 case PyUnicode_1BYTE_KIND:
8789 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8790 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8791 else
8792 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8793 break;
8794 case PyUnicode_2BYTE_KIND:
8795 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8796 break;
8797 case PyUnicode_4BYTE_KIND:
8798 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8799 break;
8800 default:
8801 assert(0); result = -2;
8802 }
8803 }
8804 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008805 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008806 case PyUnicode_1BYTE_KIND:
8807 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8808 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8809 else
8810 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8811 break;
8812 case PyUnicode_2BYTE_KIND:
8813 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8814 break;
8815 case PyUnicode_4BYTE_KIND:
8816 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8817 break;
8818 default:
8819 assert(0); result = -2;
8820 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 }
8822
8823 if (kind1 != kind)
8824 PyMem_Free(buf1);
8825 if (kind2 != kind)
8826 PyMem_Free(buf2);
8827
8828 return result;
8829}
8830
8831Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008832_PyUnicode_InsertThousandsGrouping(
8833 PyObject *unicode, Py_ssize_t index,
8834 Py_ssize_t n_buffer,
8835 void *digits, Py_ssize_t n_digits,
8836 Py_ssize_t min_width,
8837 const char *grouping, PyObject *thousands_sep,
8838 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839{
Victor Stinner41a863c2012-02-24 00:37:51 +01008840 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008841 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008842 Py_ssize_t thousands_sep_len;
8843 Py_ssize_t len;
8844
8845 if (unicode != NULL) {
8846 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008847 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008848 }
8849 else {
8850 kind = PyUnicode_1BYTE_KIND;
8851 data = NULL;
8852 }
8853 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8854 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8855 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8856 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008857 if (thousands_sep_kind < kind) {
8858 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8859 if (!thousands_sep_data)
8860 return -1;
8861 }
8862 else {
8863 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8864 if (!data)
8865 return -1;
8866 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008867 }
8868
Benjamin Petersonead6b532011-12-20 17:23:42 -06008869 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008871 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008872 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008873 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008874 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008875 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008876 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008878 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008879 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008880 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008881 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008882 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008883 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008884 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008885 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008886 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008887 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008889 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008890 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008891 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008892 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008893 break;
8894 default:
8895 assert(0);
8896 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008898 if (unicode != NULL && thousands_sep_kind != kind) {
8899 if (thousands_sep_kind < kind)
8900 PyMem_Free(thousands_sep_data);
8901 else
8902 PyMem_Free(data);
8903 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008904 if (unicode == NULL) {
8905 *maxchar = 127;
8906 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008907 *maxchar = MAX_MAXCHAR(*maxchar,
8908 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008909 }
8910 }
8911 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008912}
8913
8914
/* Helper macro normalising slice bounds in place against a length:
   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped at 0.  `start` is NOT clipped to `len`.
   Expands to bare if-statements (not a single expression or do/while), and
   evaluates its arguments more than once. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008929
Alexander Belopolsky40018472011-02-26 01:02:56 +00008930Py_ssize_t
8931PyUnicode_Count(PyObject *str,
8932 PyObject *substr,
8933 Py_ssize_t start,
8934 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008936 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008937 PyObject* str_obj;
8938 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008939 int kind1, kind2, kind;
8940 void *buf1 = NULL, *buf2 = NULL;
8941 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008942
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008943 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008944 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008946 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008947 if (!sub_obj) {
8948 Py_DECREF(str_obj);
8949 return -1;
8950 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008951 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008952 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 Py_DECREF(str_obj);
8954 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 }
Tim Petersced69f82003-09-16 20:30:58 +00008956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 kind1 = PyUnicode_KIND(str_obj);
8958 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008959 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008962 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008963 if (kind2 > kind) {
8964 Py_DECREF(sub_obj);
8965 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008966 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008967 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008968 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008969 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970 if (!buf2)
8971 goto onError;
8972 len1 = PyUnicode_GET_LENGTH(str_obj);
8973 len2 = PyUnicode_GET_LENGTH(sub_obj);
8974
8975 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008976 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008978 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8979 result = asciilib_count(
8980 ((Py_UCS1*)buf1) + start, end - start,
8981 buf2, len2, PY_SSIZE_T_MAX
8982 );
8983 else
8984 result = ucs1lib_count(
8985 ((Py_UCS1*)buf1) + start, end - start,
8986 buf2, len2, PY_SSIZE_T_MAX
8987 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 break;
8989 case PyUnicode_2BYTE_KIND:
8990 result = ucs2lib_count(
8991 ((Py_UCS2*)buf1) + start, end - start,
8992 buf2, len2, PY_SSIZE_T_MAX
8993 );
8994 break;
8995 case PyUnicode_4BYTE_KIND:
8996 result = ucs4lib_count(
8997 ((Py_UCS4*)buf1) + start, end - start,
8998 buf2, len2, PY_SSIZE_T_MAX
8999 );
9000 break;
9001 default:
9002 assert(0); result = 0;
9003 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009004
9005 Py_DECREF(sub_obj);
9006 Py_DECREF(str_obj);
9007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 if (kind2 != kind)
9009 PyMem_Free(buf2);
9010
Guido van Rossumd57fd912000-03-10 22:53:23 +00009011 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 onError:
9013 Py_DECREF(sub_obj);
9014 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 if (kind2 != kind && buf2)
9016 PyMem_Free(buf2);
9017 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018}
9019
Alexander Belopolsky40018472011-02-26 01:02:56 +00009020Py_ssize_t
9021PyUnicode_Find(PyObject *str,
9022 PyObject *sub,
9023 Py_ssize_t start,
9024 Py_ssize_t end,
9025 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009027 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009028
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009030 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009031 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009032 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009033 if (!sub) {
9034 Py_DECREF(str);
9035 return -2;
9036 }
9037 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9038 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 Py_DECREF(str);
9040 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009041 }
Tim Petersced69f82003-09-16 20:30:58 +00009042
Victor Stinner794d5672011-10-10 03:21:36 +02009043 result = any_find_slice(direction,
9044 str, sub, start, end
9045 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009046
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009048 Py_DECREF(sub);
9049
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 return result;
9051}
9052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053Py_ssize_t
9054PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9055 Py_ssize_t start, Py_ssize_t end,
9056 int direction)
9057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009058 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009059 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 if (PyUnicode_READY(str) == -1)
9061 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009062 if (start < 0 || end < 0) {
9063 PyErr_SetString(PyExc_IndexError, "string index out of range");
9064 return -2;
9065 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 if (end > PyUnicode_GET_LENGTH(str))
9067 end = PyUnicode_GET_LENGTH(str);
9068 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009069 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9070 kind, end-start, ch, direction);
9071 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009073 else
9074 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075}
9076
Alexander Belopolsky40018472011-02-26 01:02:56 +00009077static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009078tailmatch(PyObject *self,
9079 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009080 Py_ssize_t start,
9081 Py_ssize_t end,
9082 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 int kind_self;
9085 int kind_sub;
9086 void *data_self;
9087 void *data_sub;
9088 Py_ssize_t offset;
9089 Py_ssize_t i;
9090 Py_ssize_t end_sub;
9091
9092 if (PyUnicode_READY(self) == -1 ||
9093 PyUnicode_READY(substring) == -1)
9094 return 0;
9095
9096 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 return 1;
9098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9100 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009101 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 kind_self = PyUnicode_KIND(self);
9105 data_self = PyUnicode_DATA(self);
9106 kind_sub = PyUnicode_KIND(substring);
9107 data_sub = PyUnicode_DATA(substring);
9108 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9109
9110 if (direction > 0)
9111 offset = end;
9112 else
9113 offset = start;
9114
9115 if (PyUnicode_READ(kind_self, data_self, offset) ==
9116 PyUnicode_READ(kind_sub, data_sub, 0) &&
9117 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9118 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9119 /* If both are of the same kind, memcmp is sufficient */
9120 if (kind_self == kind_sub) {
9121 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009122 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 data_sub,
9124 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009125 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 }
9127 /* otherwise we have to compare each character by first accesing it */
9128 else {
9129 /* We do not need to compare 0 and len(substring)-1 because
9130 the if statement above ensured already that they are equal
9131 when we end up here. */
9132 // TODO: honor direction and do a forward or backwards search
9133 for (i = 1; i < end_sub; ++i) {
9134 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9135 PyUnicode_READ(kind_sub, data_sub, i))
9136 return 0;
9137 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 }
9141
9142 return 0;
9143}
9144
Alexander Belopolsky40018472011-02-26 01:02:56 +00009145Py_ssize_t
9146PyUnicode_Tailmatch(PyObject *str,
9147 PyObject *substr,
9148 Py_ssize_t start,
9149 Py_ssize_t end,
9150 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009152 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009153
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 str = PyUnicode_FromObject(str);
9155 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 substr = PyUnicode_FromObject(substr);
9158 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 Py_DECREF(str);
9160 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009161 }
Tim Petersced69f82003-09-16 20:30:58 +00009162
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009163 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009164 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165 Py_DECREF(str);
9166 Py_DECREF(substr);
9167 return result;
9168}
9169
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170/* Apply fixfct filter to the Unicode object self and return a
9171 reference to the modified object */
9172
Alexander Belopolsky40018472011-02-26 01:02:56 +00009173static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009174fixup(PyObject *self,
9175 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 PyObject *u;
9178 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009179 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009180
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009181 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009183 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009184 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009186 /* fix functions return the new maximum character in a string,
9187 if the kind of the resulting unicode object does not change,
9188 everything is fine. Otherwise we need to change the string kind
9189 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009190 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009191
9192 if (maxchar_new == 0) {
9193 /* no changes */;
9194 if (PyUnicode_CheckExact(self)) {
9195 Py_DECREF(u);
9196 Py_INCREF(self);
9197 return self;
9198 }
9199 else
9200 return u;
9201 }
9202
Victor Stinnere6abb482012-05-02 01:15:40 +02009203 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009204
Victor Stinnereaab6042011-12-11 22:22:39 +01009205 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009206 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009207
9208 /* In case the maximum character changed, we need to
9209 convert the string to the new category. */
9210 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9211 if (v == NULL) {
9212 Py_DECREF(u);
9213 return NULL;
9214 }
9215 if (maxchar_new > maxchar_old) {
9216 /* If the maxchar increased so that the kind changed, not all
9217 characters are representable anymore and we need to fix the
9218 string again. This only happens in very few cases. */
9219 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9220 maxchar_old = fixfct(v);
9221 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 }
9223 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009224 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009225 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009226 Py_DECREF(u);
9227 assert(_PyUnicode_CheckConsistency(v, 1));
9228 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229}
9230
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009231static PyObject *
9232ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009234 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9235 char *resdata, *data = PyUnicode_DATA(self);
9236 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009237
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009238 res = PyUnicode_New(len, 127);
9239 if (res == NULL)
9240 return NULL;
9241 resdata = PyUnicode_DATA(res);
9242 if (lower)
9243 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009245 _Py_bytes_upper(resdata, data, len);
9246 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247}
9248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009250handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009252 Py_ssize_t j;
9253 int final_sigma;
9254 Py_UCS4 c;
9255 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009256
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009257 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9258
9259 where ! is a negation and \p{xxx} is a character with property xxx.
9260 */
9261 for (j = i - 1; j >= 0; j--) {
9262 c = PyUnicode_READ(kind, data, j);
9263 if (!_PyUnicode_IsCaseIgnorable(c))
9264 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009266 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9267 if (final_sigma) {
9268 for (j = i + 1; j < length; j++) {
9269 c = PyUnicode_READ(kind, data, j);
9270 if (!_PyUnicode_IsCaseIgnorable(c))
9271 break;
9272 }
9273 final_sigma = j == length || !_PyUnicode_IsCased(c);
9274 }
9275 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276}
9277
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009278static int
9279lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9280 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009282 /* Obscure special case. */
9283 if (c == 0x3A3) {
9284 mapped[0] = handle_capital_sigma(kind, data, length, i);
9285 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009287 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009288}
9289
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290static Py_ssize_t
9291do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009292{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293 Py_ssize_t i, k = 0;
9294 int n_res, j;
9295 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009296
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009297 c = PyUnicode_READ(kind, data, 0);
9298 n_res = _PyUnicode_ToUpperFull(c, mapped);
9299 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009300 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009301 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009302 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009303 for (i = 1; i < length; i++) {
9304 c = PyUnicode_READ(kind, data, i);
9305 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9306 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009307 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009309 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009310 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009311 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312}
9313
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009314static Py_ssize_t
9315do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9316 Py_ssize_t i, k = 0;
9317
9318 for (i = 0; i < length; i++) {
9319 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9320 int n_res, j;
9321 if (Py_UNICODE_ISUPPER(c)) {
9322 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9323 }
9324 else if (Py_UNICODE_ISLOWER(c)) {
9325 n_res = _PyUnicode_ToUpperFull(c, mapped);
9326 }
9327 else {
9328 n_res = 1;
9329 mapped[0] = c;
9330 }
9331 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009332 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009333 res[k++] = mapped[j];
9334 }
9335 }
9336 return k;
9337}
9338
9339static Py_ssize_t
9340do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9341 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009343 Py_ssize_t i, k = 0;
9344
9345 for (i = 0; i < length; i++) {
9346 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9347 int n_res, j;
9348 if (lower)
9349 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9350 else
9351 n_res = _PyUnicode_ToUpperFull(c, mapped);
9352 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009353 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009354 res[k++] = mapped[j];
9355 }
9356 }
9357 return k;
9358}
9359
9360static Py_ssize_t
9361do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9362{
9363 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9364}
9365
9366static Py_ssize_t
9367do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9368{
9369 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9370}
9371
Benjamin Petersone51757f2012-01-12 21:10:29 -05009372static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009373do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9374{
9375 Py_ssize_t i, k = 0;
9376
9377 for (i = 0; i < length; i++) {
9378 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9379 Py_UCS4 mapped[3];
9380 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9381 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009382 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009383 res[k++] = mapped[j];
9384 }
9385 }
9386 return k;
9387}
9388
9389static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009390do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9391{
9392 Py_ssize_t i, k = 0;
9393 int previous_is_cased;
9394
9395 previous_is_cased = 0;
9396 for (i = 0; i < length; i++) {
9397 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9398 Py_UCS4 mapped[3];
9399 int n_res, j;
9400
9401 if (previous_is_cased)
9402 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9403 else
9404 n_res = _PyUnicode_ToTitleFull(c, mapped);
9405
9406 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009407 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009408 res[k++] = mapped[j];
9409 }
9410
9411 previous_is_cased = _PyUnicode_IsCased(c);
9412 }
9413 return k;
9414}
9415
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009416static PyObject *
9417case_operation(PyObject *self,
9418 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9419{
9420 PyObject *res = NULL;
9421 Py_ssize_t length, newlength = 0;
9422 int kind, outkind;
9423 void *data, *outdata;
9424 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9425
Benjamin Petersoneea48462012-01-16 14:28:50 -05009426 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009427
9428 kind = PyUnicode_KIND(self);
9429 data = PyUnicode_DATA(self);
9430 length = PyUnicode_GET_LENGTH(self);
9431 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9432 if (tmp == NULL)
9433 return PyErr_NoMemory();
9434 newlength = perform(kind, data, length, tmp, &maxchar);
9435 res = PyUnicode_New(newlength, maxchar);
9436 if (res == NULL)
9437 goto leave;
9438 tmpend = tmp + newlength;
9439 outdata = PyUnicode_DATA(res);
9440 outkind = PyUnicode_KIND(res);
9441 switch (outkind) {
9442 case PyUnicode_1BYTE_KIND:
9443 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9444 break;
9445 case PyUnicode_2BYTE_KIND:
9446 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9447 break;
9448 case PyUnicode_4BYTE_KIND:
9449 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9450 break;
9451 default:
9452 assert(0);
9453 break;
9454 }
9455 leave:
9456 PyMem_FREE(tmp);
9457 return res;
9458}
9459
Tim Peters8ce9f162004-08-27 01:49:32 +00009460PyObject *
9461PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009464 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009466 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009467 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9468 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009469 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009471 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009473 int use_memcpy;
9474 unsigned char *res_data = NULL, *sep_data = NULL;
9475 PyObject *last_obj;
9476 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477
Tim Peters05eba1f2004-08-27 21:32:02 +00009478 fseq = PySequence_Fast(seq, "");
9479 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009480 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009481 }
9482
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009483 /* NOTE: the following code can't call back into Python code,
9484 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009485 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009486
Tim Peters05eba1f2004-08-27 21:32:02 +00009487 seqlen = PySequence_Fast_GET_SIZE(fseq);
9488 /* If empty sequence, return u"". */
9489 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009490 Py_DECREF(fseq);
9491 Py_INCREF(unicode_empty);
9492 res = unicode_empty;
9493 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009494 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009495
Tim Peters05eba1f2004-08-27 21:32:02 +00009496 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009497 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009498 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009499 if (seqlen == 1) {
9500 if (PyUnicode_CheckExact(items[0])) {
9501 res = items[0];
9502 Py_INCREF(res);
9503 Py_DECREF(fseq);
9504 return res;
9505 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009506 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009507 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009508 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009509 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009510 /* Set up sep and seplen */
9511 if (separator == NULL) {
9512 /* fall back to a blank space separator */
9513 sep = PyUnicode_FromOrdinal(' ');
9514 if (!sep)
9515 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009516 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009517 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009518 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009519 else {
9520 if (!PyUnicode_Check(separator)) {
9521 PyErr_Format(PyExc_TypeError,
9522 "separator: expected str instance,"
9523 " %.80s found",
9524 Py_TYPE(separator)->tp_name);
9525 goto onError;
9526 }
9527 if (PyUnicode_READY(separator))
9528 goto onError;
9529 sep = separator;
9530 seplen = PyUnicode_GET_LENGTH(separator);
9531 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9532 /* inc refcount to keep this code path symmetric with the
9533 above case of a blank separator */
9534 Py_INCREF(sep);
9535 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009536 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009537 }
9538
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009539 /* There are at least two things to join, or else we have a subclass
9540 * of str in the sequence.
9541 * Do a pre-pass to figure out the total amount of space we'll
9542 * need (sz), and see whether all argument are strings.
9543 */
9544 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009545#ifdef Py_DEBUG
9546 use_memcpy = 0;
9547#else
9548 use_memcpy = 1;
9549#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009550 for (i = 0; i < seqlen; i++) {
9551 const Py_ssize_t old_sz = sz;
9552 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 if (!PyUnicode_Check(item)) {
9554 PyErr_Format(PyExc_TypeError,
9555 "sequence item %zd: expected str instance,"
9556 " %.80s found",
9557 i, Py_TYPE(item)->tp_name);
9558 goto onError;
9559 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 if (PyUnicode_READY(item) == -1)
9561 goto onError;
9562 sz += PyUnicode_GET_LENGTH(item);
9563 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009564 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009565 if (i != 0)
9566 sz += seplen;
9567 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9568 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009569 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009570 goto onError;
9571 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009572 if (use_memcpy && last_obj != NULL) {
9573 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9574 use_memcpy = 0;
9575 }
9576 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009577 }
Tim Petersced69f82003-09-16 20:30:58 +00009578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009580 if (res == NULL)
9581 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009582
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009583 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009584#ifdef Py_DEBUG
9585 use_memcpy = 0;
9586#else
9587 if (use_memcpy) {
9588 res_data = PyUnicode_1BYTE_DATA(res);
9589 kind = PyUnicode_KIND(res);
9590 if (seplen != 0)
9591 sep_data = PyUnicode_1BYTE_DATA(sep);
9592 }
9593#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009595 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009598 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009599 if (use_memcpy) {
9600 Py_MEMCPY(res_data,
9601 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009602 kind * seplen);
9603 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 }
9605 else {
9606 copy_characters(res, res_offset, sep, 0, seplen);
9607 res_offset += seplen;
9608 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009610 itemlen = PyUnicode_GET_LENGTH(item);
9611 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009612 if (use_memcpy) {
9613 Py_MEMCPY(res_data,
9614 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009615 kind * itemlen);
9616 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 }
9618 else {
9619 copy_characters(res, res_offset, item, 0, itemlen);
9620 res_offset += itemlen;
9621 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009622 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009623 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009624 if (use_memcpy)
9625 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009626 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009627 else
9628 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009629
Tim Peters05eba1f2004-08-27 21:32:02 +00009630 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009632 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009633 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009636 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009638 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639 return NULL;
9640}
9641
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width (1, 2 or 4 bytes) and must not be
   PyUnicode_WCHAR_KIND.  For the 1-byte kind the value is truncated to
   unsigned char.

   Every macro argument is parenthesized where it is expanded, so that
   an expression argument (for example a conditional expression) is
   evaluated as a whole. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9665
Victor Stinner3fe55312012-01-04 00:33:50 +01009666Py_ssize_t
9667PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9668 Py_UCS4 fill_char)
9669{
9670 Py_ssize_t maxlen;
9671 enum PyUnicode_Kind kind;
9672 void *data;
9673
9674 if (!PyUnicode_Check(unicode)) {
9675 PyErr_BadInternalCall();
9676 return -1;
9677 }
9678 if (PyUnicode_READY(unicode) == -1)
9679 return -1;
9680 if (unicode_check_modifiable(unicode))
9681 return -1;
9682
9683 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9684 PyErr_SetString(PyExc_ValueError,
9685 "fill character is bigger than "
9686 "the string maximum character");
9687 return -1;
9688 }
9689
9690 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9691 length = Py_MIN(maxlen, length);
9692 if (length <= 0)
9693 return 0;
9694
9695 kind = PyUnicode_KIND(unicode);
9696 data = PyUnicode_DATA(unicode);
9697 FILL(kind, data, fill_char, start, length);
9698 return length;
9699}
9700
Victor Stinner9310abb2011-10-05 00:59:23 +02009701static PyObject *
9702pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009703 Py_ssize_t left,
9704 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 PyObject *u;
9708 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009709 int kind;
9710 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009711
9712 if (left < 0)
9713 left = 0;
9714 if (right < 0)
9715 right = 0;
9716
Victor Stinnerc4b49542011-12-11 22:44:26 +01009717 if (left == 0 && right == 0)
9718 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9721 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009722 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9723 return NULL;
9724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009726 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009728 if (!u)
9729 return NULL;
9730
9731 kind = PyUnicode_KIND(u);
9732 data = PyUnicode_DATA(u);
9733 if (left)
9734 FILL(kind, data, fill, 0, left);
9735 if (right)
9736 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009737 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009738 assert(_PyUnicode_CheckConsistency(u, 1));
9739 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740}
9741
Alexander Belopolsky40018472011-02-26 01:02:56 +00009742PyObject *
9743PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746
9747 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009748 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009750 if (PyUnicode_READY(string) == -1) {
9751 Py_DECREF(string);
9752 return NULL;
9753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754
Benjamin Petersonead6b532011-12-20 17:23:42 -06009755 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009757 if (PyUnicode_IS_ASCII(string))
9758 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009759 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009760 PyUnicode_GET_LENGTH(string), keepends);
9761 else
9762 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009763 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009764 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 break;
9766 case PyUnicode_2BYTE_KIND:
9767 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009768 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 PyUnicode_GET_LENGTH(string), keepends);
9770 break;
9771 case PyUnicode_4BYTE_KIND:
9772 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009773 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009774 PyUnicode_GET_LENGTH(string), keepends);
9775 break;
9776 default:
9777 assert(0);
9778 list = 0;
9779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780 Py_DECREF(string);
9781 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782}
9783
Alexander Belopolsky40018472011-02-26 01:02:56 +00009784static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009785split(PyObject *self,
9786 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009787 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 int kind1, kind2, kind;
9790 void *buf1, *buf2;
9791 Py_ssize_t len1, len2;
9792 PyObject* out;
9793
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009795 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 if (PyUnicode_READY(self) == -1)
9798 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009801 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009802 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009803 if (PyUnicode_IS_ASCII(self))
9804 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009805 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009806 PyUnicode_GET_LENGTH(self), maxcount
9807 );
9808 else
9809 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009810 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009811 PyUnicode_GET_LENGTH(self), maxcount
9812 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 case PyUnicode_2BYTE_KIND:
9814 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009815 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 PyUnicode_GET_LENGTH(self), maxcount
9817 );
9818 case PyUnicode_4BYTE_KIND:
9819 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 default:
9824 assert(0);
9825 return NULL;
9826 }
9827
9828 if (PyUnicode_READY(substring) == -1)
9829 return NULL;
9830
9831 kind1 = PyUnicode_KIND(self);
9832 kind2 = PyUnicode_KIND(substring);
9833 kind = kind1 > kind2 ? kind1 : kind2;
9834 buf1 = PyUnicode_DATA(self);
9835 buf2 = PyUnicode_DATA(substring);
9836 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009837 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 if (!buf1)
9839 return NULL;
9840 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009841 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 if (!buf2) {
9843 if (kind1 != kind) PyMem_Free(buf1);
9844 return NULL;
9845 }
9846 len1 = PyUnicode_GET_LENGTH(self);
9847 len2 = PyUnicode_GET_LENGTH(substring);
9848
Benjamin Petersonead6b532011-12-20 17:23:42 -06009849 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009851 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9852 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009853 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009854 else
9855 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009856 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 break;
9858 case PyUnicode_2BYTE_KIND:
9859 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009860 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 break;
9862 case PyUnicode_4BYTE_KIND:
9863 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009864 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 break;
9866 default:
9867 out = NULL;
9868 }
9869 if (kind1 != kind)
9870 PyMem_Free(buf1);
9871 if (kind2 != kind)
9872 PyMem_Free(buf2);
9873 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009874}
9875
Alexander Belopolsky40018472011-02-26 01:02:56 +00009876static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009877rsplit(PyObject *self,
9878 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009879 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 int kind1, kind2, kind;
9882 void *buf1, *buf2;
9883 Py_ssize_t len1, len2;
9884 PyObject* out;
9885
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009886 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009887 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009888
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 if (PyUnicode_READY(self) == -1)
9890 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009891
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009893 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009895 if (PyUnicode_IS_ASCII(self))
9896 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009897 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009898 PyUnicode_GET_LENGTH(self), maxcount
9899 );
9900 else
9901 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009902 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009903 PyUnicode_GET_LENGTH(self), maxcount
9904 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 case PyUnicode_2BYTE_KIND:
9906 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009907 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 PyUnicode_GET_LENGTH(self), maxcount
9909 );
9910 case PyUnicode_4BYTE_KIND:
9911 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 default:
9916 assert(0);
9917 return NULL;
9918 }
9919
9920 if (PyUnicode_READY(substring) == -1)
9921 return NULL;
9922
9923 kind1 = PyUnicode_KIND(self);
9924 kind2 = PyUnicode_KIND(substring);
9925 kind = kind1 > kind2 ? kind1 : kind2;
9926 buf1 = PyUnicode_DATA(self);
9927 buf2 = PyUnicode_DATA(substring);
9928 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009929 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (!buf1)
9931 return NULL;
9932 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009933 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 if (!buf2) {
9935 if (kind1 != kind) PyMem_Free(buf1);
9936 return NULL;
9937 }
9938 len1 = PyUnicode_GET_LENGTH(self);
9939 len2 = PyUnicode_GET_LENGTH(substring);
9940
Benjamin Petersonead6b532011-12-20 17:23:42 -06009941 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009942 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009943 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9944 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009945 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009946 else
9947 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009948 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 break;
9950 case PyUnicode_2BYTE_KIND:
9951 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009952 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 break;
9954 case PyUnicode_4BYTE_KIND:
9955 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009956 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 break;
9958 default:
9959 out = NULL;
9960 }
9961 if (kind1 != kind)
9962 PyMem_Free(buf1);
9963 if (kind2 != kind)
9964 PyMem_Free(buf2);
9965 return out;
9966}
9967
9968static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009969anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9970 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009972 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9975 return asciilib_find(buf1, len1, buf2, len2, offset);
9976 else
9977 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 case PyUnicode_2BYTE_KIND:
9979 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9980 case PyUnicode_4BYTE_KIND:
9981 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9982 }
9983 assert(0);
9984 return -1;
9985}
9986
9987static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009988anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9989 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009991 switch (kind) {
9992 case PyUnicode_1BYTE_KIND:
9993 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9994 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9995 else
9996 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9997 case PyUnicode_2BYTE_KIND:
9998 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9999 case PyUnicode_4BYTE_KIND:
10000 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10001 }
10002 assert(0);
10003 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010004}
10005
Alexander Belopolsky40018472011-02-26 01:02:56 +000010006static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007replace(PyObject *self, PyObject *str1,
10008 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010009{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 PyObject *u;
10011 char *sbuf = PyUnicode_DATA(self);
10012 char *buf1 = PyUnicode_DATA(str1);
10013 char *buf2 = PyUnicode_DATA(str2);
10014 int srelease = 0, release1 = 0, release2 = 0;
10015 int skind = PyUnicode_KIND(self);
10016 int kind1 = PyUnicode_KIND(str1);
10017 int kind2 = PyUnicode_KIND(str2);
10018 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10019 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10020 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010021 int mayshrink;
10022 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
10024 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010025 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010027 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028
Victor Stinner59de0ee2011-10-07 10:01:28 +020010029 if (str1 == str2)
10030 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (skind < kind1)
10032 /* substring too wide to be present */
10033 goto nothing;
10034
Victor Stinner49a0a212011-10-12 23:46:10 +020010035 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10036 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10037 /* Replacing str1 with str2 may cause a maxchar reduction in the
10038 result string. */
10039 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010040 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010043 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010045 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010047 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010048 Py_UCS4 u1, u2;
10049 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010050 Py_ssize_t index, pos;
10051 char *src;
10052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010054 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10055 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010056 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010059 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010061 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010063
10064 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10065 index = 0;
10066 src = sbuf;
10067 while (--maxcount)
10068 {
10069 pos++;
10070 src += pos * PyUnicode_KIND(self);
10071 slen -= pos;
10072 index += pos;
10073 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10074 if (pos < 0)
10075 break;
10076 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10077 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010078 }
10079 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 int rkind = skind;
10081 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010082 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 if (kind1 < rkind) {
10085 /* widen substring */
10086 buf1 = _PyUnicode_AsKind(str1, rkind);
10087 if (!buf1) goto error;
10088 release1 = 1;
10089 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010090 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010091 if (i < 0)
10092 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 if (rkind > kind2) {
10094 /* widen replacement */
10095 buf2 = _PyUnicode_AsKind(str2, rkind);
10096 if (!buf2) goto error;
10097 release2 = 1;
10098 }
10099 else if (rkind < kind2) {
10100 /* widen self and buf1 */
10101 rkind = kind2;
10102 if (release1) PyMem_Free(buf1);
10103 sbuf = _PyUnicode_AsKind(self, rkind);
10104 if (!sbuf) goto error;
10105 srelease = 1;
10106 buf1 = _PyUnicode_AsKind(str1, rkind);
10107 if (!buf1) goto error;
10108 release1 = 1;
10109 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010110 u = PyUnicode_New(slen, maxchar);
10111 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010113 assert(PyUnicode_KIND(u) == rkind);
10114 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010115
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010116 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010117 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010118 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010120 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010122
10123 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010124 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010125 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010126 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010127 if (i == -1)
10128 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010129 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010131 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010135 }
10136 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 Py_ssize_t n, i, j, ires;
10138 Py_ssize_t product, new_size;
10139 int rkind = skind;
10140 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010143 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 buf1 = _PyUnicode_AsKind(str1, rkind);
10145 if (!buf1) goto error;
10146 release1 = 1;
10147 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010148 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010149 if (n == 0)
10150 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010152 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 buf2 = _PyUnicode_AsKind(str2, rkind);
10154 if (!buf2) goto error;
10155 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010158 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 rkind = kind2;
10160 sbuf = _PyUnicode_AsKind(self, rkind);
10161 if (!sbuf) goto error;
10162 srelease = 1;
10163 if (release1) PyMem_Free(buf1);
10164 buf1 = _PyUnicode_AsKind(str1, rkind);
10165 if (!buf1) goto error;
10166 release1 = 1;
10167 }
10168 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10169 PyUnicode_GET_LENGTH(str1))); */
10170 product = n * (len2-len1);
10171 if ((product / (len2-len1)) != n) {
10172 PyErr_SetString(PyExc_OverflowError,
10173 "replace string is too long");
10174 goto error;
10175 }
10176 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010177 if (new_size == 0) {
10178 Py_INCREF(unicode_empty);
10179 u = unicode_empty;
10180 goto done;
10181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10183 PyErr_SetString(PyExc_OverflowError,
10184 "replace string is too long");
10185 goto error;
10186 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010187 u = PyUnicode_New(new_size, maxchar);
10188 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010190 assert(PyUnicode_KIND(u) == rkind);
10191 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 ires = i = 0;
10193 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010194 while (n-- > 0) {
10195 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010196 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010197 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010198 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010199 if (j == -1)
10200 break;
10201 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010202 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010203 memcpy(res + rkind * ires,
10204 sbuf + rkind * i,
10205 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010207 }
10208 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010210 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010212 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010218 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 memcpy(res + rkind * ires,
10220 sbuf + rkind * i,
10221 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010222 }
10223 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010224 /* interleave */
10225 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 if (--n <= 0)
10231 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 memcpy(res + rkind * ires,
10233 sbuf + rkind * i,
10234 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 ires++;
10236 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010237 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010238 memcpy(res + rkind * ires,
10239 sbuf + rkind * i,
10240 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010241 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010242 }
10243
10244 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010245 unicode_adjust_maxchar(&u);
10246 if (u == NULL)
10247 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010249
10250 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (srelease)
10252 PyMem_FREE(sbuf);
10253 if (release1)
10254 PyMem_FREE(buf1);
10255 if (release2)
10256 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010257 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259
Benjamin Peterson29060642009-01-31 22:14:21 +000010260 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010261 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 if (srelease)
10263 PyMem_FREE(sbuf);
10264 if (release1)
10265 PyMem_FREE(buf1);
10266 if (release2)
10267 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010268 return unicode_result_unchanged(self);
10269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 error:
10271 if (srelease && sbuf)
10272 PyMem_FREE(sbuf);
10273 if (release1 && buf1)
10274 PyMem_FREE(buf1);
10275 if (release2 && buf2)
10276 PyMem_FREE(buf2);
10277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278}
10279
10280/* --- Unicode Object Methods --------------------------------------------- */
10281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010282PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010283 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284\n\
10285Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010286characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
10288static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010289unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010291 if (PyUnicode_READY(self) == -1)
10292 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010293 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294}
10295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010296PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298\n\
10299Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010300have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
10302static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010303unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010305 if (PyUnicode_READY(self) == -1)
10306 return NULL;
10307 if (PyUnicode_GET_LENGTH(self) == 0)
10308 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010309 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310}
10311
Benjamin Petersond5890c82012-01-14 13:23:30 -050010312PyDoc_STRVAR(casefold__doc__,
10313 "S.casefold() -> str\n\
10314\n\
10315Return a version of S suitable for caseless comparisons.");
10316
10317static PyObject *
10318unicode_casefold(PyObject *self)
10319{
10320 if (PyUnicode_READY(self) == -1)
10321 return NULL;
10322 if (PyUnicode_IS_ASCII(self))
10323 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010324 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010325}
10326
10327
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010328/* Argument converter. Coerces to a single unicode character */
10329
10330static int
10331convert_uc(PyObject *obj, void *addr)
10332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010334 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010335
Benjamin Peterson14339b62009-01-31 16:36:08 +000010336 uniobj = PyUnicode_FromObject(obj);
10337 if (uniobj == NULL) {
10338 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010340 return 0;
10341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010343 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010345 Py_DECREF(uniobj);
10346 return 0;
10347 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010349 Py_DECREF(uniobj);
10350 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010351}
10352
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010353PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010354 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010356Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010357done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358
10359static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010360unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010362 Py_ssize_t marg, left;
10363 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 Py_UCS4 fillchar = ' ';
10365
Victor Stinnere9a29352011-10-01 02:14:59 +020010366 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368
Benjamin Petersonbac79492012-01-14 13:34:47 -050010369 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370 return NULL;
10371
Victor Stinnerc4b49542011-12-11 22:44:26 +010010372 if (PyUnicode_GET_LENGTH(self) >= width)
10373 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
Victor Stinnerc4b49542011-12-11 22:44:26 +010010375 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376 left = marg / 2 + (marg & width & 1);
10377
Victor Stinner9310abb2011-10-05 00:59:23 +020010378 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379}
10380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381/* This function assumes that str1 and str2 are readied by the caller. */
10382
Marc-André Lemburge5034372000-08-08 08:04:29 +000010383static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010384unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 int kind1, kind2;
10387 void *data1, *data2;
10388 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 kind1 = PyUnicode_KIND(str1);
10391 kind2 = PyUnicode_KIND(str2);
10392 data1 = PyUnicode_DATA(str1);
10393 data2 = PyUnicode_DATA(str2);
10394 len1 = PyUnicode_GET_LENGTH(str1);
10395 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 for (i = 0; i < len1 && i < len2; ++i) {
10398 Py_UCS4 c1, c2;
10399 c1 = PyUnicode_READ(kind1, data1, i);
10400 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010401
10402 if (c1 != c2)
10403 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010404 }
10405
10406 return (len1 < len2) ? -1 : (len1 != len2);
10407}
10408
Alexander Belopolsky40018472011-02-26 01:02:56 +000010409int
10410PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10413 if (PyUnicode_READY(left) == -1 ||
10414 PyUnicode_READY(right) == -1)
10415 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010416 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010418 PyErr_Format(PyExc_TypeError,
10419 "Can't compare %.100s and %.100s",
10420 left->ob_type->tp_name,
10421 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010422 return -1;
10423}
10424
Martin v. Löwis5b222132007-06-10 09:51:05 +000010425int
10426PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 Py_ssize_t i;
10429 int kind;
10430 void *data;
10431 Py_UCS4 chr;
10432
Victor Stinner910337b2011-10-03 03:20:16 +020010433 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 if (PyUnicode_READY(uni) == -1)
10435 return -1;
10436 kind = PyUnicode_KIND(uni);
10437 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010438 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10440 if (chr != str[i])
10441 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010442 /* This check keeps Python strings that end in '\0' from comparing equal
10443 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010445 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010446 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010447 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010448 return 0;
10449}
10450
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010451
Benjamin Peterson29060642009-01-31 22:14:21 +000010452#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010453 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010454
Alexander Belopolsky40018472011-02-26 01:02:56 +000010455PyObject *
10456PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010457{
10458 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010459
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010460 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10461 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_READY(left) == -1 ||
10463 PyUnicode_READY(right) == -1)
10464 return NULL;
10465 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10466 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010467 if (op == Py_EQ) {
10468 Py_INCREF(Py_False);
10469 return Py_False;
10470 }
10471 if (op == Py_NE) {
10472 Py_INCREF(Py_True);
10473 return Py_True;
10474 }
10475 }
10476 if (left == right)
10477 result = 0;
10478 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010479 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010480
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010481 /* Convert the return value to a Boolean */
10482 switch (op) {
10483 case Py_EQ:
10484 v = TEST_COND(result == 0);
10485 break;
10486 case Py_NE:
10487 v = TEST_COND(result != 0);
10488 break;
10489 case Py_LE:
10490 v = TEST_COND(result <= 0);
10491 break;
10492 case Py_GE:
10493 v = TEST_COND(result >= 0);
10494 break;
10495 case Py_LT:
10496 v = TEST_COND(result == -1);
10497 break;
10498 case Py_GT:
10499 v = TEST_COND(result == 1);
10500 break;
10501 default:
10502 PyErr_BadArgument();
10503 return NULL;
10504 }
10505 Py_INCREF(v);
10506 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010508
Brian Curtindfc80e32011-08-10 20:28:54 -050010509 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010510}
10511
Alexander Belopolsky40018472011-02-26 01:02:56 +000010512int
10513PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010514{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010515 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 int kind1, kind2, kind;
10517 void *buf1, *buf2;
10518 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010519 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010520
10521 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010522 sub = PyUnicode_FromObject(element);
10523 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 PyErr_Format(PyExc_TypeError,
10525 "'in <string>' requires string as left operand, not %s",
10526 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010527 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010528 }
10529
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010531 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 Py_DECREF(sub);
10533 return -1;
10534 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010535 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10536 Py_DECREF(sub);
10537 Py_DECREF(str);
10538 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 kind1 = PyUnicode_KIND(str);
10541 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010542 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 buf1 = PyUnicode_DATA(str);
10544 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010545 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010546 if (kind2 > kind) {
10547 Py_DECREF(sub);
10548 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010549 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010550 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010551 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 if (!buf2) {
10554 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010555 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 return -1;
10557 }
10558 len1 = PyUnicode_GET_LENGTH(str);
10559 len2 = PyUnicode_GET_LENGTH(sub);
10560
Benjamin Petersonead6b532011-12-20 17:23:42 -060010561 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 case PyUnicode_1BYTE_KIND:
10563 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10564 break;
10565 case PyUnicode_2BYTE_KIND:
10566 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10567 break;
10568 case PyUnicode_4BYTE_KIND:
10569 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10570 break;
10571 default:
10572 result = -1;
10573 assert(0);
10574 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010575
10576 Py_DECREF(str);
10577 Py_DECREF(sub);
10578
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 if (kind2 != kind)
10580 PyMem_Free(buf2);
10581
Guido van Rossum403d68b2000-03-13 15:55:09 +000010582 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010583}
10584
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585/* Concat to string or Unicode object giving a new Unicode object. */
10586
Alexander Belopolsky40018472011-02-26 01:02:56 +000010587PyObject *
10588PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010591 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010592 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593
10594 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010600 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
10602 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010603 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010604 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010607 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010608 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 }
10611
Victor Stinner488fa492011-12-12 00:01:39 +010010612 u_len = PyUnicode_GET_LENGTH(u);
10613 v_len = PyUnicode_GET_LENGTH(v);
10614 if (u_len > PY_SSIZE_T_MAX - v_len) {
10615 PyErr_SetString(PyExc_OverflowError,
10616 "strings are too large to concat");
10617 goto onError;
10618 }
10619 new_len = u_len + v_len;
10620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010622 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010623 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010626 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010628 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010010629 copy_characters(w, 0, u, 0, u_len);
10630 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631 Py_DECREF(u);
10632 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010633 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635
Benjamin Peterson29060642009-01-31 22:14:21 +000010636 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 Py_XDECREF(u);
10638 Py_XDECREF(v);
10639 return NULL;
10640}
10641
Walter Dörwald1ab83302007-05-18 17:15:44 +000010642void
Victor Stinner23e56682011-10-03 03:54:37 +020010643PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010644{
Victor Stinner23e56682011-10-03 03:54:37 +020010645 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010646 Py_UCS4 maxchar, maxchar2;
10647 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010648
10649 if (p_left == NULL) {
10650 if (!PyErr_Occurred())
10651 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010652 return;
10653 }
Victor Stinner23e56682011-10-03 03:54:37 +020010654 left = *p_left;
10655 if (right == NULL || !PyUnicode_Check(left)) {
10656 if (!PyErr_Occurred())
10657 PyErr_BadInternalCall();
10658 goto error;
10659 }
10660
Benjamin Petersonbac79492012-01-14 13:34:47 -050010661 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010662 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010663 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010664 goto error;
10665
Victor Stinner488fa492011-12-12 00:01:39 +010010666 /* Shortcuts */
10667 if (left == unicode_empty) {
10668 Py_DECREF(left);
10669 Py_INCREF(right);
10670 *p_left = right;
10671 return;
10672 }
10673 if (right == unicode_empty)
10674 return;
10675
10676 left_len = PyUnicode_GET_LENGTH(left);
10677 right_len = PyUnicode_GET_LENGTH(right);
10678 if (left_len > PY_SSIZE_T_MAX - right_len) {
10679 PyErr_SetString(PyExc_OverflowError,
10680 "strings are too large to concat");
10681 goto error;
10682 }
10683 new_len = left_len + right_len;
10684
10685 if (unicode_modifiable(left)
10686 && PyUnicode_CheckExact(right)
10687 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010688 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10689 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010690 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010691 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010692 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10693 {
10694 /* append inplace */
10695 if (unicode_resize(p_left, new_len) != 0) {
10696 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10697 * deallocated so it cannot be put back into
10698 * 'variable'. The MemoryError is raised when there
10699 * is no value in 'variable', which might (very
10700 * remotely) be a cause of incompatibilities.
10701 */
10702 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010703 }
Victor Stinner488fa492011-12-12 00:01:39 +010010704 /* copy 'right' into the newly allocated area of 'left' */
10705 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010706 }
Victor Stinner488fa492011-12-12 00:01:39 +010010707 else {
10708 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10709 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010710 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010711
Victor Stinner488fa492011-12-12 00:01:39 +010010712 /* Concat the two Unicode strings */
10713 res = PyUnicode_New(new_len, maxchar);
10714 if (res == NULL)
10715 goto error;
10716 copy_characters(res, 0, left, 0, left_len);
10717 copy_characters(res, left_len, right, 0, right_len);
10718 Py_DECREF(left);
10719 *p_left = res;
10720 }
10721 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010722 return;
10723
10724error:
Victor Stinner488fa492011-12-12 00:01:39 +010010725 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010726}
10727
10728void
10729PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10730{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010731 PyUnicode_Append(pleft, right);
10732 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010733}
10734
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010735PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010736 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010738Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010739string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010740interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
10742static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010743unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010745 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010746 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010747 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 int kind1, kind2, kind;
10750 void *buf1, *buf2;
10751 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
Jesus Ceaac451502011-04-20 17:09:23 +020010753 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10754 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010755 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 kind1 = PyUnicode_KIND(self);
10758 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010759 if (kind2 > kind1)
10760 return PyLong_FromLong(0);
10761 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 buf1 = PyUnicode_DATA(self);
10763 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010765 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 if (!buf2) {
10767 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 return NULL;
10769 }
10770 len1 = PyUnicode_GET_LENGTH(self);
10771 len2 = PyUnicode_GET_LENGTH(substring);
10772
10773 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010774 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 case PyUnicode_1BYTE_KIND:
10776 iresult = ucs1lib_count(
10777 ((Py_UCS1*)buf1) + start, end - start,
10778 buf2, len2, PY_SSIZE_T_MAX
10779 );
10780 break;
10781 case PyUnicode_2BYTE_KIND:
10782 iresult = ucs2lib_count(
10783 ((Py_UCS2*)buf1) + start, end - start,
10784 buf2, len2, PY_SSIZE_T_MAX
10785 );
10786 break;
10787 case PyUnicode_4BYTE_KIND:
10788 iresult = ucs4lib_count(
10789 ((Py_UCS4*)buf1) + start, end - start,
10790 buf2, len2, PY_SSIZE_T_MAX
10791 );
10792 break;
10793 default:
10794 assert(0); iresult = 0;
10795 }
10796
10797 result = PyLong_FromSsize_t(iresult);
10798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010799 if (kind2 != kind)
10800 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801
10802 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010803
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804 return result;
10805}
10806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010807PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010808 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010810Encode S using the codec registered for encoding. Default encoding\n\
10811is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010812handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010813a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10814'xmlcharrefreplace' as well as any other name registered with\n\
10815codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010818unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010820 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821 char *encoding = NULL;
10822 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010823
Benjamin Peterson308d6372009-09-18 21:42:35 +000010824 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10825 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010827 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010828}
10829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010830PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010831 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832\n\
10833Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010834If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010837unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010839 Py_ssize_t i, j, line_pos, src_len, incr;
10840 Py_UCS4 ch;
10841 PyObject *u;
10842 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010844 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010845 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
10847 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
Antoine Pitrou22425222011-10-04 19:10:51 +020010850 if (PyUnicode_READY(self) == -1)
10851 return NULL;
10852
Thomas Wouters7e474022000-07-16 12:04:32 +000010853 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010854 src_len = PyUnicode_GET_LENGTH(self);
10855 i = j = line_pos = 0;
10856 kind = PyUnicode_KIND(self);
10857 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010858 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010859 for (; i < src_len; i++) {
10860 ch = PyUnicode_READ(kind, src_data, i);
10861 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010862 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010863 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010864 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010865 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010866 goto overflow;
10867 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010868 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010869 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010872 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010873 goto overflow;
10874 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010876 if (ch == '\n' || ch == '\r')
10877 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010879 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010880 if (!found)
10881 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010882
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885 if (!u)
10886 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
Antoine Pitroue71d5742011-10-04 15:55:09 +020010889 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 for (; i < src_len; i++) {
10892 ch = PyUnicode_READ(kind, src_data, i);
10893 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010894 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010895 incr = tabsize - (line_pos % tabsize);
10896 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010897 FILL(kind, dest_data, ' ', j, incr);
10898 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010900 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 line_pos++;
10903 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010904 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 if (ch == '\n' || ch == '\r')
10906 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010908 }
10909 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010910 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010911
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010913 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915}
10916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010917PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010918 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919\n\
10920Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010921such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922arguments start and end are interpreted as in slice notation.\n\
10923\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010924Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
10926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010929 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010930 Py_ssize_t start;
10931 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
Jesus Ceaac451502011-04-20 17:09:23 +020010934 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10935 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (PyUnicode_READY(self) == -1)
10939 return NULL;
10940 if (PyUnicode_READY(substring) == -1)
10941 return NULL;
10942
Victor Stinner7931d9a2011-11-04 00:22:48 +010010943 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 if (result == -2)
10948 return NULL;
10949
Christian Heimes217cfd12007-12-02 14:31:20 +000010950 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951}
10952
10953static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010954unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010956 void *data;
10957 enum PyUnicode_Kind kind;
10958 Py_UCS4 ch;
10959 PyObject *res;
10960
10961 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10962 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010964 }
10965 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10966 PyErr_SetString(PyExc_IndexError, "string index out of range");
10967 return NULL;
10968 }
10969 kind = PyUnicode_KIND(self);
10970 data = PyUnicode_DATA(self);
10971 ch = PyUnicode_READ(kind, data, index);
10972 if (ch < 256)
10973 return get_latin1_char(ch);
10974
10975 res = PyUnicode_New(1, ch);
10976 if (res == NULL)
10977 return NULL;
10978 kind = PyUnicode_KIND(res);
10979 data = PyUnicode_DATA(res);
10980 PyUnicode_WRITE(kind, data, 0, ch);
10981 assert(_PyUnicode_CheckConsistency(res, 1));
10982 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983}
10984
Guido van Rossumc2504932007-09-18 19:42:40 +000010985/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010986 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010987static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010988unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989{
Guido van Rossumc2504932007-09-18 19:42:40 +000010990 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010991 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010992
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010993#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010994 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010995#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (_PyUnicode_HASH(self) != -1)
10997 return _PyUnicode_HASH(self);
10998 if (PyUnicode_READY(self) == -1)
10999 return -1;
11000 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011001 /*
11002 We make the hash of the empty string be 0, rather than using
11003 (prefix ^ suffix), since this slightly obfuscates the hash secret
11004 */
11005 if (len == 0) {
11006 _PyUnicode_HASH(self) = 0;
11007 return 0;
11008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009
11010 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011011#define HASH(P) \
11012 x ^= (Py_uhash_t) *P << 7; \
11013 while (--len >= 0) \
11014 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015
Georg Brandl2fb477c2012-02-21 00:33:36 +010011016 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 switch (PyUnicode_KIND(self)) {
11018 case PyUnicode_1BYTE_KIND: {
11019 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11020 HASH(c);
11021 break;
11022 }
11023 case PyUnicode_2BYTE_KIND: {
11024 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11025 HASH(s);
11026 break;
11027 }
11028 default: {
11029 Py_UCS4 *l;
11030 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11031 "Impossible switch case in unicode_hash");
11032 l = PyUnicode_4BYTE_DATA(self);
11033 HASH(l);
11034 break;
11035 }
11036 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011037 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11038 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039
Guido van Rossumc2504932007-09-18 19:42:40 +000011040 if (x == -1)
11041 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011043 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011047PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011048 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011050Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051
11052static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011055 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011056 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011057 Py_ssize_t start;
11058 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
Jesus Ceaac451502011-04-20 17:09:23 +020011060 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11061 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (PyUnicode_READY(self) == -1)
11065 return NULL;
11066 if (PyUnicode_READY(substring) == -1)
11067 return NULL;
11068
Victor Stinner7931d9a2011-11-04 00:22:48 +010011069 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070
11071 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (result == -2)
11074 return NULL;
11075
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076 if (result < 0) {
11077 PyErr_SetString(PyExc_ValueError, "substring not found");
11078 return NULL;
11079 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011080
Christian Heimes217cfd12007-12-02 14:31:20 +000011081 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082}
11083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011084PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011087Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011088at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089
11090static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011091unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 Py_ssize_t i, length;
11094 int kind;
11095 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 int cased;
11097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 if (PyUnicode_READY(self) == -1)
11099 return NULL;
11100 length = PyUnicode_GET_LENGTH(self);
11101 kind = PyUnicode_KIND(self);
11102 data = PyUnicode_DATA(self);
11103
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011105 if (length == 1)
11106 return PyBool_FromLong(
11107 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011109 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011111 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011112
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 for (i = 0; i < length; i++) {
11115 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011116
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11118 return PyBool_FromLong(0);
11119 else if (!cased && Py_UNICODE_ISLOWER(ch))
11120 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011122 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123}
11124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011126 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011128Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011129at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011130
11131static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011132unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 Py_ssize_t i, length;
11135 int kind;
11136 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137 int cased;
11138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 if (PyUnicode_READY(self) == -1)
11140 return NULL;
11141 length = PyUnicode_GET_LENGTH(self);
11142 kind = PyUnicode_KIND(self);
11143 data = PyUnicode_DATA(self);
11144
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (length == 1)
11147 return PyBool_FromLong(
11148 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011150 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011153
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011155 for (i = 0; i < length; i++) {
11156 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011157
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11159 return PyBool_FromLong(0);
11160 else if (!cased && Py_UNICODE_ISUPPER(ch))
11161 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011163 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164}
11165
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011169Return True if S is a titlecased string and there is at least one\n\
11170character in S, i.e. upper- and titlecase characters may only\n\
11171follow uncased characters and lowercase characters only cased ones.\n\
11172Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173
11174static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011175unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 Py_ssize_t i, length;
11178 int kind;
11179 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 int cased, previous_is_cased;
11181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011182 if (PyUnicode_READY(self) == -1)
11183 return NULL;
11184 length = PyUnicode_GET_LENGTH(self);
11185 kind = PyUnicode_KIND(self);
11186 data = PyUnicode_DATA(self);
11187
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189 if (length == 1) {
11190 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11191 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11192 (Py_UNICODE_ISUPPER(ch) != 0));
11193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011195 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011197 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011198
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 cased = 0;
11200 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 for (i = 0; i < length; i++) {
11202 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011203
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11205 if (previous_is_cased)
11206 return PyBool_FromLong(0);
11207 previous_is_cased = 1;
11208 cased = 1;
11209 }
11210 else if (Py_UNICODE_ISLOWER(ch)) {
11211 if (!previous_is_cased)
11212 return PyBool_FromLong(0);
11213 previous_is_cased = 1;
11214 cased = 1;
11215 }
11216 else
11217 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011219 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011225Return True if all characters in S are whitespace\n\
11226and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 Py_ssize_t i, length;
11232 int kind;
11233 void *data;
11234
11235 if (PyUnicode_READY(self) == -1)
11236 return NULL;
11237 length = PyUnicode_GET_LENGTH(self);
11238 kind = PyUnicode_KIND(self);
11239 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (length == 1)
11243 return PyBool_FromLong(
11244 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011246 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 for (i = 0; i < length; i++) {
11251 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011252 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011255 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256}
11257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011258PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011260\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011261Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011262and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011263
11264static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011265unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 Py_ssize_t i, length;
11268 int kind;
11269 void *data;
11270
11271 if (PyUnicode_READY(self) == -1)
11272 return NULL;
11273 length = PyUnicode_GET_LENGTH(self);
11274 kind = PyUnicode_KIND(self);
11275 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011276
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011277 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (length == 1)
11279 return PyBool_FromLong(
11280 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281
11282 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 for (i = 0; i < length; i++) {
11287 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011289 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011290 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011291}
11292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011295\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011296Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011298
11299static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011300unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 int kind;
11303 void *data;
11304 Py_ssize_t len, i;
11305
11306 if (PyUnicode_READY(self) == -1)
11307 return NULL;
11308
11309 kind = PyUnicode_KIND(self);
11310 data = PyUnicode_DATA(self);
11311 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011312
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314 if (len == 1) {
11315 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11316 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11317 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011318
11319 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 for (i = 0; i < len; i++) {
11324 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011325 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011327 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011328 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011329}
11330
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011331PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011332 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011334Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011335False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
11337static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011338unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 Py_ssize_t i, length;
11341 int kind;
11342 void *data;
11343
11344 if (PyUnicode_READY(self) == -1)
11345 return NULL;
11346 length = PyUnicode_GET_LENGTH(self);
11347 kind = PyUnicode_KIND(self);
11348 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011351 if (length == 1)
11352 return PyBool_FromLong(
11353 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011355 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 for (i = 0; i < length; i++) {
11360 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011363 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011369Return True if all characters in S are digits\n\
11370and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011373unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 Py_ssize_t i, length;
11376 int kind;
11377 void *data;
11378
11379 if (PyUnicode_READY(self) == -1)
11380 return NULL;
11381 length = PyUnicode_GET_LENGTH(self);
11382 kind = PyUnicode_KIND(self);
11383 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 if (length == 1) {
11387 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11388 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11389 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011391 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 for (i = 0; i < length; i++) {
11396 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011397 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011399 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011400}
11401
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011402PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011403 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011405Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011407
11408static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011409unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 Py_ssize_t i, length;
11412 int kind;
11413 void *data;
11414
11415 if (PyUnicode_READY(self) == -1)
11416 return NULL;
11417 length = PyUnicode_GET_LENGTH(self);
11418 kind = PyUnicode_KIND(self);
11419 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011422 if (length == 1)
11423 return PyBool_FromLong(
11424 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011426 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 for (i = 0; i < length; i++) {
11431 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011434 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435}
11436
Martin v. Löwis47383402007-08-15 07:32:56 +000011437int
11438PyUnicode_IsIdentifier(PyObject *self)
11439{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 int kind;
11441 void *data;
11442 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011443 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (PyUnicode_READY(self) == -1) {
11446 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011447 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 }
11449
11450 /* Special case for empty strings */
11451 if (PyUnicode_GET_LENGTH(self) == 0)
11452 return 0;
11453 kind = PyUnicode_KIND(self);
11454 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011455
11456 /* PEP 3131 says that the first character must be in
11457 XID_Start and subsequent characters in XID_Continue,
11458 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011459 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011460 letters, digits, underscore). However, given the current
11461 definition of XID_Start and XID_Continue, it is sufficient
11462 to check just for these, except that _ must be allowed
11463 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011465 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011466 return 0;
11467
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011468 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011471 return 1;
11472}
11473
11474PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011476\n\
11477Return True if S is a valid identifier according\n\
11478to the language definition.");
11479
11480static PyObject*
11481unicode_isidentifier(PyObject *self)
11482{
11483 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11484}
11485
Georg Brandl559e5d72008-06-11 18:37:52 +000011486PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011487 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011488\n\
11489Return True if all characters in S are considered\n\
11490printable in repr() or S is empty, False otherwise.");
11491
11492static PyObject*
11493unicode_isprintable(PyObject *self)
11494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011495 Py_ssize_t i, length;
11496 int kind;
11497 void *data;
11498
11499 if (PyUnicode_READY(self) == -1)
11500 return NULL;
11501 length = PyUnicode_GET_LENGTH(self);
11502 kind = PyUnicode_KIND(self);
11503 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011504
11505 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 if (length == 1)
11507 return PyBool_FromLong(
11508 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 for (i = 0; i < length; i++) {
11511 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011512 Py_RETURN_FALSE;
11513 }
11514 }
11515 Py_RETURN_TRUE;
11516}
11517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011519 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520\n\
11521Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011522iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
11524static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011525unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011527 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528}
11529
Martin v. Löwis18e16552006-02-15 17:27:45 +000011530static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011531unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (PyUnicode_READY(self) == -1)
11534 return -1;
11535 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536}
11537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011538PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011541Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011542done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
11544static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011545unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011547 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 Py_UCS4 fillchar = ' ';
11549
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011550 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 return NULL;
11552
Benjamin Petersonbac79492012-01-14 13:34:47 -050011553 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011554 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555
Victor Stinnerc4b49542011-12-11 22:44:26 +010011556 if (PyUnicode_GET_LENGTH(self) >= width)
11557 return unicode_result_unchanged(self);
11558
11559 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560}
11561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011562PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011565Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566
11567static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011568unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011570 if (PyUnicode_READY(self) == -1)
11571 return NULL;
11572 if (PyUnicode_IS_ASCII(self))
11573 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011574 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575}
11576
/* Strip modes: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the strip modes above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11585
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011586/* externally visible for str.strip(unicode) */
11587PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011588_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 void *data;
11591 int kind;
11592 Py_ssize_t i, j, len;
11593 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11596 return NULL;
11597
11598 kind = PyUnicode_KIND(self);
11599 data = PyUnicode_DATA(self);
11600 len = PyUnicode_GET_LENGTH(self);
11601 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11602 PyUnicode_DATA(sepobj),
11603 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011604
Benjamin Peterson14339b62009-01-31 16:36:08 +000011605 i = 0;
11606 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 while (i < len &&
11608 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011609 i++;
11610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011611 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612
Benjamin Peterson14339b62009-01-31 16:36:08 +000011613 j = len;
11614 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011615 do {
11616 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 } while (j >= i &&
11618 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011619 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011620 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011621
Victor Stinner7931d9a2011-11-04 00:22:48 +010011622 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011623}
11624
11625PyObject*
11626PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11627{
11628 unsigned char *data;
11629 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011630 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631
Victor Stinnerde636f32011-10-01 03:55:54 +020011632 if (PyUnicode_READY(self) == -1)
11633 return NULL;
11634
Victor Stinner684d5fd2012-05-03 02:32:34 +020011635 length = PyUnicode_GET_LENGTH(self);
11636 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011637
Victor Stinner684d5fd2012-05-03 02:32:34 +020011638 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011639 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640
Victor Stinnerde636f32011-10-01 03:55:54 +020011641 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011642 PyErr_SetString(PyExc_IndexError, "string index out of range");
11643 return NULL;
11644 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011645 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011646 Py_INCREF(unicode_empty);
11647 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011648 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011649
Victor Stinner684d5fd2012-05-03 02:32:34 +020011650 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011651 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011652 data = PyUnicode_1BYTE_DATA(self);
11653 return unicode_fromascii(data + start, length);
11654 }
11655 else {
11656 kind = PyUnicode_KIND(self);
11657 data = PyUnicode_1BYTE_DATA(self);
11658 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011659 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011660 length);
11661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663
11664static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 int kind;
11668 void *data;
11669 Py_ssize_t len, i, j;
11670
11671 if (PyUnicode_READY(self) == -1)
11672 return NULL;
11673
11674 kind = PyUnicode_KIND(self);
11675 data = PyUnicode_DATA(self);
11676 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011677
Benjamin Peterson14339b62009-01-31 16:36:08 +000011678 i = 0;
11679 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011681 i++;
11682 }
11683 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011684
Benjamin Peterson14339b62009-01-31 16:36:08 +000011685 j = len;
11686 if (striptype != LEFTSTRIP) {
11687 do {
11688 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 j++;
11691 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011692
Victor Stinner7931d9a2011-11-04 00:22:48 +010011693 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694}
11695
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
11697static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011698do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011701
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11703 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
Benjamin Peterson14339b62009-01-31 16:36:08 +000011705 if (sep != NULL && sep != Py_None) {
11706 if (PyUnicode_Check(sep))
11707 return _PyUnicode_XStrip(self, striptype, sep);
11708 else {
11709 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011710 "%s arg must be None or str",
11711 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 return NULL;
11713 }
11714 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717}
11718
11719
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011720PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011722\n\
11723Return a copy of the string S with leading and trailing\n\
11724whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011725If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
11727static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 if (PyTuple_GET_SIZE(args) == 0)
11731 return do_strip(self, BOTHSTRIP); /* Common case */
11732 else
11733 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734}
11735
11736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011739\n\
11740Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011741If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742
11743static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011746 if (PyTuple_GET_SIZE(args) == 0)
11747 return do_strip(self, LEFTSTRIP); /* Common case */
11748 else
11749 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750}
11751
11752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011753PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755\n\
11756Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011757If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758
11759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 if (PyTuple_GET_SIZE(args) == 0)
11763 return do_strip(self, RIGHTSTRIP); /* Common case */
11764 else
11765 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766}
11767
11768
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011770unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
Georg Brandl222de0f2009-04-12 12:01:50 +000011775 if (len < 1) {
11776 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011777 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011778 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Victor Stinnerc4b49542011-12-11 22:44:26 +010011780 /* no repeat, return original string */
11781 if (len == 1)
11782 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011783
Benjamin Petersonbac79492012-01-14 13:34:47 -050011784 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 return NULL;
11786
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011787 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011788 PyErr_SetString(PyExc_OverflowError,
11789 "repeated string is too long");
11790 return NULL;
11791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011793
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011794 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 if (!u)
11796 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011797 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (PyUnicode_GET_LENGTH(str) == 1) {
11800 const int kind = PyUnicode_KIND(str);
11801 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011802 if (kind == PyUnicode_1BYTE_KIND) {
11803 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011804 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011805 }
11806 else if (kind == PyUnicode_2BYTE_KIND) {
11807 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011808 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011809 ucs2[n] = fill_char;
11810 } else {
11811 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11812 assert(kind == PyUnicode_4BYTE_KIND);
11813 for (n = 0; n < len; ++n)
11814 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 }
11817 else {
11818 /* number of characters copied this far */
11819 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011820 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 char *to = (char *) PyUnicode_DATA(u);
11822 Py_MEMCPY(to, PyUnicode_DATA(str),
11823 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011825 n = (done <= nchars-done) ? done : nchars-done;
11826 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011827 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829 }
11830
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011831 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011832 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833}
11834
Alexander Belopolsky40018472011-02-26 01:02:56 +000011835PyObject *
11836PyUnicode_Replace(PyObject *obj,
11837 PyObject *subobj,
11838 PyObject *replobj,
11839 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840{
11841 PyObject *self;
11842 PyObject *str1;
11843 PyObject *str2;
11844 PyObject *result;
11845
11846 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011847 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011850 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011851 Py_DECREF(self);
11852 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 }
11854 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011855 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 Py_DECREF(self);
11857 Py_DECREF(str1);
11858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011860 if (PyUnicode_READY(self) == -1 ||
11861 PyUnicode_READY(str1) == -1 ||
11862 PyUnicode_READY(str2) == -1)
11863 result = NULL;
11864 else
11865 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 Py_DECREF(self);
11867 Py_DECREF(str1);
11868 Py_DECREF(str2);
11869 return result;
11870}
11871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011872PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011873 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011874\n\
11875Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011876old replaced by new. If the optional argument count is\n\
11877given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878
11879static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 PyObject *str1;
11883 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011884 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 PyObject *result;
11886
Martin v. Löwis18e16552006-02-15 17:27:45 +000011887 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011889 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011890 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011892 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 return NULL;
11894 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011895 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011896 Py_DECREF(str1);
11897 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011898 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011899 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11900 result = NULL;
11901 else
11902 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903
11904 Py_DECREF(str1);
11905 Py_DECREF(str2);
11906 return result;
11907}
11908
Alexander Belopolsky40018472011-02-26 01:02:56 +000011909static PyObject *
11910unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011912 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 Py_ssize_t isize;
11914 Py_ssize_t osize, squote, dquote, i, o;
11915 Py_UCS4 max, quote;
11916 int ikind, okind;
11917 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011920 return NULL;
11921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 isize = PyUnicode_GET_LENGTH(unicode);
11923 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 /* Compute length of output, quote characters, and
11926 maximum character */
11927 osize = 2; /* quotes */
11928 max = 127;
11929 squote = dquote = 0;
11930 ikind = PyUnicode_KIND(unicode);
11931 for (i = 0; i < isize; i++) {
11932 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11933 switch (ch) {
11934 case '\'': squote++; osize++; break;
11935 case '"': dquote++; osize++; break;
11936 case '\\': case '\t': case '\r': case '\n':
11937 osize += 2; break;
11938 default:
11939 /* Fast-path ASCII */
11940 if (ch < ' ' || ch == 0x7f)
11941 osize += 4; /* \xHH */
11942 else if (ch < 0x7f)
11943 osize++;
11944 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11945 osize++;
11946 max = ch > max ? ch : max;
11947 }
11948 else if (ch < 0x100)
11949 osize += 4; /* \xHH */
11950 else if (ch < 0x10000)
11951 osize += 6; /* \uHHHH */
11952 else
11953 osize += 10; /* \uHHHHHHHH */
11954 }
11955 }
11956
11957 quote = '\'';
11958 if (squote) {
11959 if (dquote)
11960 /* Both squote and dquote present. Use squote,
11961 and escape them */
11962 osize += squote;
11963 else
11964 quote = '"';
11965 }
11966
11967 repr = PyUnicode_New(osize, max);
11968 if (repr == NULL)
11969 return NULL;
11970 okind = PyUnicode_KIND(repr);
11971 odata = PyUnicode_DATA(repr);
11972
11973 PyUnicode_WRITE(okind, odata, 0, quote);
11974 PyUnicode_WRITE(okind, odata, osize-1, quote);
11975
11976 for (i = 0, o = 1; i < isize; i++) {
11977 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011978
11979 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980 if ((ch == quote) || (ch == '\\')) {
11981 PyUnicode_WRITE(okind, odata, o++, '\\');
11982 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011983 continue;
11984 }
11985
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011987 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011988 PyUnicode_WRITE(okind, odata, o++, '\\');
11989 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011990 }
11991 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 PyUnicode_WRITE(okind, odata, o++, '\\');
11993 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011994 }
11995 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 PyUnicode_WRITE(okind, odata, o++, '\\');
11997 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998 }
11999
12000 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012001 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 PyUnicode_WRITE(okind, odata, o++, '\\');
12003 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012004 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12005 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012006 }
12007
Georg Brandl559e5d72008-06-11 18:37:52 +000012008 /* Copy ASCII characters as-is */
12009 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012011 }
12012
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012014 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012015 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012016 (categories Z* and C* except ASCII space)
12017 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012019 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 if (ch <= 0xff) {
12021 PyUnicode_WRITE(okind, odata, o++, '\\');
12022 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012023 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012025 }
12026 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 else if (ch >= 0x10000) {
12028 PyUnicode_WRITE(okind, odata, o++, '\\');
12029 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012030 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12031 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12032 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12033 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12034 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12035 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12036 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12037 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012038 }
12039 /* Map 16-bit characters to '\uxxxx' */
12040 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 PyUnicode_WRITE(okind, odata, o++, '\\');
12042 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12045 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12046 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012047 }
12048 }
12049 /* Copy characters as-is */
12050 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012052 }
12053 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012054 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012056 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012057 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058}
12059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012060PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062\n\
12063Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012064such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065arguments start and end are interpreted as in slice notation.\n\
12066\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012067Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
12069static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012072 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012073 Py_ssize_t start;
12074 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012075 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
Jesus Ceaac451502011-04-20 17:09:23 +020012077 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12078 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 if (PyUnicode_READY(self) == -1)
12082 return NULL;
12083 if (PyUnicode_READY(substring) == -1)
12084 return NULL;
12085
Victor Stinner7931d9a2011-11-04 00:22:48 +010012086 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
12088 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 if (result == -2)
12091 return NULL;
12092
Christian Heimes217cfd12007-12-02 14:31:20 +000012093 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012096PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012097 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012099Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012100
12101static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012104 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012105 Py_ssize_t start;
12106 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012107 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
Jesus Ceaac451502011-04-20 17:09:23 +020012109 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12110 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012111 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 if (PyUnicode_READY(self) == -1)
12114 return NULL;
12115 if (PyUnicode_READY(substring) == -1)
12116 return NULL;
12117
Victor Stinner7931d9a2011-11-04 00:22:48 +010012118 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012119
12120 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (result == -2)
12123 return NULL;
12124
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125 if (result < 0) {
12126 PyErr_SetString(PyExc_ValueError, "substring not found");
12127 return NULL;
12128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129
Christian Heimes217cfd12007-12-02 14:31:20 +000012130 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131}
12132
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012133PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012135\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012136Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012137done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138
12139static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012140unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012142 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 Py_UCS4 fillchar = ' ';
12144
Victor Stinnere9a29352011-10-01 02:14:59 +020012145 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012147
Benjamin Petersonbac79492012-01-14 13:34:47 -050012148 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149 return NULL;
12150
Victor Stinnerc4b49542011-12-11 22:44:26 +010012151 if (PyUnicode_GET_LENGTH(self) >= width)
12152 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
Victor Stinnerc4b49542011-12-11 22:44:26 +010012154 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155}
12156
Alexander Belopolsky40018472011-02-26 01:02:56 +000012157PyObject *
12158PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159{
12160 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012161
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162 s = PyUnicode_FromObject(s);
12163 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012164 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012165 if (sep != NULL) {
12166 sep = PyUnicode_FromObject(sep);
12167 if (sep == NULL) {
12168 Py_DECREF(s);
12169 return NULL;
12170 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171 }
12172
Victor Stinner9310abb2011-10-05 00:59:23 +020012173 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
12175 Py_DECREF(s);
12176 Py_XDECREF(sep);
12177 return result;
12178}
12179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012180PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012181 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182\n\
12183Return a list of the words in S, using sep as the\n\
12184delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012185splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012186whitespace string is a separator and empty strings are\n\
12187removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
12189static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012190unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012192 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012194 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012196 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12197 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 return NULL;
12199
12200 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012203 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012205 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206}
12207
Thomas Wouters477c8d52006-05-27 19:21:47 +000012208PyObject *
12209PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12210{
12211 PyObject* str_obj;
12212 PyObject* sep_obj;
12213 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 int kind1, kind2, kind;
12215 void *buf1 = NULL, *buf2 = NULL;
12216 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012217
12218 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012219 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012220 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012221 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012222 if (!sep_obj) {
12223 Py_DECREF(str_obj);
12224 return NULL;
12225 }
12226 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12227 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012228 Py_DECREF(str_obj);
12229 return NULL;
12230 }
12231
Victor Stinner14f8f022011-10-05 20:58:25 +020012232 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012234 kind = Py_MAX(kind1, kind2);
12235 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012236 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012237 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (!buf1)
12239 goto onError;
12240 buf2 = PyUnicode_DATA(sep_obj);
12241 if (kind2 != kind)
12242 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12243 if (!buf2)
12244 goto onError;
12245 len1 = PyUnicode_GET_LENGTH(str_obj);
12246 len2 = PyUnicode_GET_LENGTH(sep_obj);
12247
Benjamin Petersonead6b532011-12-20 17:23:42 -060012248 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012250 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12251 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12252 else
12253 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 break;
12255 case PyUnicode_2BYTE_KIND:
12256 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12257 break;
12258 case PyUnicode_4BYTE_KIND:
12259 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12260 break;
12261 default:
12262 assert(0);
12263 out = 0;
12264 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012265
12266 Py_DECREF(sep_obj);
12267 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012268 if (kind1 != kind)
12269 PyMem_Free(buf1);
12270 if (kind2 != kind)
12271 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012272
12273 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 onError:
12275 Py_DECREF(sep_obj);
12276 Py_DECREF(str_obj);
12277 if (kind1 != kind && buf1)
12278 PyMem_Free(buf1);
12279 if (kind2 != kind && buf2)
12280 PyMem_Free(buf2);
12281 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012282}
12283
12284
12285PyObject *
12286PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12287{
12288 PyObject* str_obj;
12289 PyObject* sep_obj;
12290 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 int kind1, kind2, kind;
12292 void *buf1 = NULL, *buf2 = NULL;
12293 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294
12295 str_obj = PyUnicode_FromObject(str_in);
12296 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012298 sep_obj = PyUnicode_FromObject(sep_in);
12299 if (!sep_obj) {
12300 Py_DECREF(str_obj);
12301 return NULL;
12302 }
12303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 kind1 = PyUnicode_KIND(str_in);
12305 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012306 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 buf1 = PyUnicode_DATA(str_in);
12308 if (kind1 != kind)
12309 buf1 = _PyUnicode_AsKind(str_in, kind);
12310 if (!buf1)
12311 goto onError;
12312 buf2 = PyUnicode_DATA(sep_obj);
12313 if (kind2 != kind)
12314 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12315 if (!buf2)
12316 goto onError;
12317 len1 = PyUnicode_GET_LENGTH(str_obj);
12318 len2 = PyUnicode_GET_LENGTH(sep_obj);
12319
Benjamin Petersonead6b532011-12-20 17:23:42 -060012320 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012322 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12323 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12324 else
12325 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 break;
12327 case PyUnicode_2BYTE_KIND:
12328 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12329 break;
12330 case PyUnicode_4BYTE_KIND:
12331 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12332 break;
12333 default:
12334 assert(0);
12335 out = 0;
12336 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012337
12338 Py_DECREF(sep_obj);
12339 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 if (kind1 != kind)
12341 PyMem_Free(buf1);
12342 if (kind2 != kind)
12343 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012344
12345 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 onError:
12347 Py_DECREF(sep_obj);
12348 Py_DECREF(str_obj);
12349 if (kind1 != kind && buf1)
12350 PyMem_Free(buf1);
12351 if (kind2 != kind && buf2)
12352 PyMem_Free(buf2);
12353 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012354}
12355
12356PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012358\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012359Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012360the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012361found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362
12363static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012364unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012365{
Victor Stinner9310abb2011-10-05 00:59:23 +020012366 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012367}
12368
12369PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012370 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012371\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012372Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012374separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375
12376static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012377unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012378{
Victor Stinner9310abb2011-10-05 00:59:23 +020012379 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380}
12381
Alexander Belopolsky40018472011-02-26 01:02:56 +000012382PyObject *
12383PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012384{
12385 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012386
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012387 s = PyUnicode_FromObject(s);
12388 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012389 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 if (sep != NULL) {
12391 sep = PyUnicode_FromObject(sep);
12392 if (sep == NULL) {
12393 Py_DECREF(s);
12394 return NULL;
12395 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012396 }
12397
Victor Stinner9310abb2011-10-05 00:59:23 +020012398 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012399
12400 Py_DECREF(s);
12401 Py_XDECREF(sep);
12402 return result;
12403}
12404
12405PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012406 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012407\n\
12408Return a list of the words in S, using sep as the\n\
12409delimiter string, starting at the end of the string and\n\
12410working to the front. If maxsplit is given, at most maxsplit\n\
12411splits are done. If sep is not specified, any whitespace string\n\
12412is a separator.");
12413
12414static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012415unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012416{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012417 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012418 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012419 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012420
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012421 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12422 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012423 return NULL;
12424
12425 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012427 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012428 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012429 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012430 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012431}
12432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012433PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435\n\
12436Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012437Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012438is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012439
12440static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012441unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012443 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012444 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012446 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12447 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012448 return NULL;
12449
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012450 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451}
12452
12453static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012454PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012455{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012456 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457}
12458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012459PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461\n\
12462Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012463and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012464
12465static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012466unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012468 if (PyUnicode_READY(self) == -1)
12469 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012470 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012471}
12472
Georg Brandlceee0772007-11-27 23:48:05 +000012473PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012474 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012475\n\
12476Return a translation table usable for str.translate().\n\
12477If there is only one argument, it must be a dictionary mapping Unicode\n\
12478ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012479Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012480If there are two arguments, they must be strings of equal length, and\n\
12481in the resulting dictionary, each character in x will be mapped to the\n\
12482character at the same position in y. If there is a third argument, it\n\
12483must be a string, whose characters will be mapped to None in the result.");
12484
12485static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012486unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012487{
12488 PyObject *x, *y = NULL, *z = NULL;
12489 PyObject *new = NULL, *key, *value;
12490 Py_ssize_t i = 0;
12491 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492
Georg Brandlceee0772007-11-27 23:48:05 +000012493 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12494 return NULL;
12495 new = PyDict_New();
12496 if (!new)
12497 return NULL;
12498 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499 int x_kind, y_kind, z_kind;
12500 void *x_data, *y_data, *z_data;
12501
Georg Brandlceee0772007-11-27 23:48:05 +000012502 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012503 if (!PyUnicode_Check(x)) {
12504 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12505 "be a string if there is a second argument");
12506 goto err;
12507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012509 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12510 "arguments must have equal length");
12511 goto err;
12512 }
12513 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 x_kind = PyUnicode_KIND(x);
12515 y_kind = PyUnicode_KIND(y);
12516 x_data = PyUnicode_DATA(x);
12517 y_data = PyUnicode_DATA(y);
12518 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12519 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012520 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012521 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012522 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012523 if (!value) {
12524 Py_DECREF(key);
12525 goto err;
12526 }
Georg Brandlceee0772007-11-27 23:48:05 +000012527 res = PyDict_SetItem(new, key, value);
12528 Py_DECREF(key);
12529 Py_DECREF(value);
12530 if (res < 0)
12531 goto err;
12532 }
12533 /* create entries for deleting chars in z */
12534 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012535 z_kind = PyUnicode_KIND(z);
12536 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012537 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012539 if (!key)
12540 goto err;
12541 res = PyDict_SetItem(new, key, Py_None);
12542 Py_DECREF(key);
12543 if (res < 0)
12544 goto err;
12545 }
12546 }
12547 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 int kind;
12549 void *data;
12550
Georg Brandlceee0772007-11-27 23:48:05 +000012551 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012552 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012553 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12554 "to maketrans it must be a dict");
12555 goto err;
12556 }
12557 /* copy entries into the new dict, converting string keys to int keys */
12558 while (PyDict_Next(x, &i, &key, &value)) {
12559 if (PyUnicode_Check(key)) {
12560 /* convert string keys to integer keys */
12561 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012562 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012563 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12564 "table must be of length 1");
12565 goto err;
12566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 kind = PyUnicode_KIND(key);
12568 data = PyUnicode_DATA(key);
12569 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012570 if (!newkey)
12571 goto err;
12572 res = PyDict_SetItem(new, newkey, value);
12573 Py_DECREF(newkey);
12574 if (res < 0)
12575 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012576 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012577 /* just keep integer keys */
12578 if (PyDict_SetItem(new, key, value) < 0)
12579 goto err;
12580 } else {
12581 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12582 "be strings or integers");
12583 goto err;
12584 }
12585 }
12586 }
12587 return new;
12588 err:
12589 Py_DECREF(new);
12590 return NULL;
12591}
12592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012593PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595\n\
12596Return a copy of the string S, where all characters have been mapped\n\
12597through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012598Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012599Unmapped characters are left untouched. Characters mapped to None\n\
12600are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601
12602static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012603unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606}
12607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012608PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012611Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612
12613static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012614unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012616 if (PyUnicode_READY(self) == -1)
12617 return NULL;
12618 if (PyUnicode_IS_ASCII(self))
12619 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012620 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621}
12622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012623PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012626Pad a numeric string S with zeros on the left, to fill a field\n\
12627of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628
12629static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012630unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012632 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012633 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012634 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 int kind;
12636 void *data;
12637 Py_UCS4 chr;
12638
Martin v. Löwis18e16552006-02-15 17:27:45 +000012639 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640 return NULL;
12641
Benjamin Petersonbac79492012-01-14 13:34:47 -050012642 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
Victor Stinnerc4b49542011-12-11 22:44:26 +010012645 if (PyUnicode_GET_LENGTH(self) >= width)
12646 return unicode_result_unchanged(self);
12647
12648 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649
12650 u = pad(self, fill, 0, '0');
12651
Walter Dörwald068325e2002-04-15 13:36:47 +000012652 if (u == NULL)
12653 return NULL;
12654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 kind = PyUnicode_KIND(u);
12656 data = PyUnicode_DATA(u);
12657 chr = PyUnicode_READ(kind, data, fill);
12658
12659 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 PyUnicode_WRITE(kind, data, 0, chr);
12662 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663 }
12664
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012665 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012666 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668
#if 0
/* Compiled out.  Wraps PyUnicode_TransformDecimalAndSpaceToASCII() so it
   can be exposed as a method for debugging. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12676
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012677PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012678 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012680Return True if S starts with the specified prefix, False otherwise.\n\
12681With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012682With optional end, stop comparing S at that position.\n\
12683prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
12685static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012686unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012687 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012689 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012690 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012691 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012692 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012693 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
Jesus Ceaac451502011-04-20 17:09:23 +020012695 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012697 if (PyTuple_Check(subobj)) {
12698 Py_ssize_t i;
12699 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012700 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012701 if (substring == NULL)
12702 return NULL;
12703 result = tailmatch(self, substring, start, end, -1);
12704 Py_DECREF(substring);
12705 if (result) {
12706 Py_RETURN_TRUE;
12707 }
12708 }
12709 /* nothing matched */
12710 Py_RETURN_FALSE;
12711 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012712 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012713 if (substring == NULL) {
12714 if (PyErr_ExceptionMatches(PyExc_TypeError))
12715 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12716 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012718 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012719 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012721 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722}
12723
12724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012725PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012727\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012728Return True if S ends with the specified suffix, False otherwise.\n\
12729With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012730With optional end, stop comparing S at that position.\n\
12731suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732
12733static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012734unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012737 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012738 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012739 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012740 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012741 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742
Jesus Ceaac451502011-04-20 17:09:23 +020012743 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012745 if (PyTuple_Check(subobj)) {
12746 Py_ssize_t i;
12747 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012748 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012749 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012750 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012751 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012752 result = tailmatch(self, substring, start, end, +1);
12753 Py_DECREF(substring);
12754 if (result) {
12755 Py_RETURN_TRUE;
12756 }
12757 }
12758 Py_RETURN_FALSE;
12759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012760 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012761 if (substring == NULL) {
12762 if (PyErr_ExceptionMatches(PyExc_TypeError))
12763 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12764 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012765 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012766 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012769 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770}
12771
Victor Stinner202fdca2012-05-07 12:47:02 +020012772typedef struct {
12773 PyObject *buffer;
12774 void *data;
12775 enum PyUnicode_Kind kind;
12776 Py_UCS4 maxchar;
12777 Py_ssize_t pos;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012778} _PyUnicodeWriter ;
Victor Stinner202fdca2012-05-07 12:47:02 +020012779
12780Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012781_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012782{
12783 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12784 writer->data = PyUnicode_DATA(writer->buffer);
12785 writer->kind = PyUnicode_KIND(writer->buffer);
12786}
12787
12788Py_LOCAL(int)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012789_PyUnicodeWriter_Init(_PyUnicodeWriter *writer,
Victor Stinner202fdca2012-05-07 12:47:02 +020012790 Py_ssize_t length, Py_UCS4 maxchar)
12791{
12792 writer->pos = 0;
12793 writer->buffer = PyUnicode_New(length, maxchar);
12794 if (writer->buffer == NULL)
12795 return -1;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012796 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012797 return 0;
12798}
12799
12800Py_LOCAL_INLINE(int)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012801_PyUnicodeWriter_Prepare(_PyUnicodeWriter *writer,
Victor Stinner202fdca2012-05-07 12:47:02 +020012802 Py_ssize_t length, Py_UCS4 maxchar)
12803{
12804 Py_ssize_t newlen;
12805 PyObject *newbuffer;
12806
12807 if (length > PY_SSIZE_T_MAX - writer->pos) {
12808 PyErr_NoMemory();
12809 return -1;
12810 }
12811 newlen = writer->pos + length;
12812
12813 if (newlen > PyUnicode_GET_LENGTH(writer->buffer)) {
Victor Stinner10680252012-05-07 23:50:05 +020012814 /* overallocate 25% to limit the number of resize */
12815 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
Victor Stinner202fdca2012-05-07 12:47:02 +020012816 newlen += newlen / 4;
12817
12818 if (maxchar > writer->maxchar) {
12819 /* resize + widen */
12820 newbuffer = PyUnicode_New(newlen, maxchar);
12821 if (newbuffer == NULL)
12822 return -1;
12823 PyUnicode_CopyCharacters(newbuffer, 0,
12824 writer->buffer, 0, writer->pos);
12825 Py_DECREF(writer->buffer);
12826 }
12827 else {
12828 newbuffer = resize_compact(writer->buffer, newlen);
12829 if (newbuffer == NULL)
12830 return -1;
12831 }
12832 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012833 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012834 }
12835 else if (maxchar > writer->maxchar) {
12836 if (unicode_widen(&writer->buffer, writer->pos, maxchar) < 0)
12837 return -1;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012838 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012839 }
12840 return 0;
12841}
12842
Victor Stinner202fdca2012-05-07 12:47:02 +020012843Py_LOCAL(PyObject *)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012844_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012845{
12846 if (PyUnicode_Resize(&writer->buffer, writer->pos) < 0) {
12847 Py_DECREF(writer->buffer);
12848 return NULL;
12849 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012850 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012851 return writer->buffer;
12852}
12853
12854Py_LOCAL(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012855_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012856{
12857 Py_CLEAR(writer->buffer);
12858}
12859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012860#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012861
12862PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012864\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012865Return a formatted version of S, using substitutions from args and kwargs.\n\
12866The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012867
Eric Smith27bbca62010-11-04 17:06:58 +000012868PyDoc_STRVAR(format_map__doc__,
12869 "S.format_map(mapping) -> str\n\
12870\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012871Return a formatted version of S, using substitutions from mapping.\n\
12872The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012873
Eric Smith4a7d76d2008-05-30 18:10:19 +000012874static PyObject *
12875unicode__format__(PyObject* self, PyObject* args)
12876{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012877 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012878
12879 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12880 return NULL;
12881
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012882 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012883 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012884 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012885}
12886
Eric Smith8c663262007-08-25 02:26:07 +000012887PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012889\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012890Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012891
12892static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012893unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 Py_ssize_t size;
12896
12897 /* If it's a compact object, account for base structure +
12898 character data. */
12899 if (PyUnicode_IS_COMPACT_ASCII(v))
12900 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12901 else if (PyUnicode_IS_COMPACT(v))
12902 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012903 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 else {
12905 /* If it is a two-block object, account for base object, and
12906 for character block if present. */
12907 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012908 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012909 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012910 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012911 }
12912 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012913 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012914 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012916 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012917 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918
12919 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012920}
12921
12922PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012924
12925static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012926unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012927{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012928 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 if (!copy)
12930 return NULL;
12931 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012932}
12933
Guido van Rossumd57fd912000-03-10 22:53:23 +000012934static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012935 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012936 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012937 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12938 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012939 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12940 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012941 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012942 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12943 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12944 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12945 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12946 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012947 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012948 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12949 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12950 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012951 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12953 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12954 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012955 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012956 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012957 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012958 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012959 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12960 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12961 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12962 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12963 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12964 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12965 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12966 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12967 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12968 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12969 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12970 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12971 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12972 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012973 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012974 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012975 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012976 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012977 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012978 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012979 {"maketrans", (PyCFunction) unicode_maketrans,
12980 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012981 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012982#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012983 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012984 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012985#endif
12986
Benjamin Peterson14339b62009-01-31 16:36:08 +000012987 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012988 {NULL, NULL}
12989};
12990
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012991static PyObject *
12992unicode_mod(PyObject *v, PyObject *w)
12993{
Brian Curtindfc80e32011-08-10 20:28:54 -050012994 if (!PyUnicode_Check(v))
12995 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012997}
12998
12999static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013000 0, /*nb_add*/
13001 0, /*nb_subtract*/
13002 0, /*nb_multiply*/
13003 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013004};
13005
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 (lenfunc) unicode_length, /* sq_length */
13008 PyUnicode_Concat, /* sq_concat */
13009 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13010 (ssizeargfunc) unicode_getitem, /* sq_item */
13011 0, /* sq_slice */
13012 0, /* sq_ass_item */
13013 0, /* sq_ass_slice */
13014 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013015};
13016
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013017static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013018unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013019{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 if (PyUnicode_READY(self) == -1)
13021 return NULL;
13022
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013023 if (PyIndex_Check(item)) {
13024 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013025 if (i == -1 && PyErr_Occurred())
13026 return NULL;
13027 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013029 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013030 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013031 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013032 PyObject *result;
13033 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013034 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013035 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013038 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013039 return NULL;
13040 }
13041
13042 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013043 Py_INCREF(unicode_empty);
13044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013046 slicelength == PyUnicode_GET_LENGTH(self)) {
13047 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013048 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013049 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013050 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013051 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013052 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013053 src_kind = PyUnicode_KIND(self);
13054 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013055 if (!PyUnicode_IS_ASCII(self)) {
13056 kind_limit = kind_maxchar_limit(src_kind);
13057 max_char = 0;
13058 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13059 ch = PyUnicode_READ(src_kind, src_data, cur);
13060 if (ch > max_char) {
13061 max_char = ch;
13062 if (max_char >= kind_limit)
13063 break;
13064 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013065 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013066 }
Victor Stinner55c99112011-10-13 01:17:06 +020013067 else
13068 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013069 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013070 if (result == NULL)
13071 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013072 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013073 dest_data = PyUnicode_DATA(result);
13074
13075 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013076 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13077 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013078 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013079 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013080 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013081 } else {
13082 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13083 return NULL;
13084 }
13085}
13086
13087static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 (lenfunc)unicode_length, /* mp_length */
13089 (binaryfunc)unicode_subscript, /* mp_subscript */
13090 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013091};
13092
Guido van Rossumd57fd912000-03-10 22:53:23 +000013093
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094/* Helpers for PyUnicode_Format() */
13095
13096static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013097getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013099 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013100 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 (*p_argidx)++;
13102 if (arglen < 0)
13103 return args;
13104 else
13105 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 }
13107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 return NULL;
13110}
13111
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013112/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013114static PyObject *
13115formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013117 char *p;
13118 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013120
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 x = PyFloat_AsDouble(v);
13122 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013123 return NULL;
13124
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013127
Eric Smith0923d1d2009-04-16 20:16:10 +000013128 p = PyOS_double_to_string(x, type, prec,
13129 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013130 if (p == NULL)
13131 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013132 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013133 PyMem_Free(p);
13134 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135}
13136
Victor Stinnerd0880d52012-04-27 23:40:13 +020013137/* formatlong() emulates the format codes d, u, o, x and X, and
13138 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13139 * Python's regular ints.
13140 * Return value: a new PyUnicodeObject*, or NULL if error.
13141 * The output string is of the form
13142 * "-"? ("0x" | "0X")? digit+
13143 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13144 * set in flags. The case of hex digits will be correct,
13145 * There will be at least prec digits, zero-filled on the left if
13146 * necessary to get that many.
13147 * val object to be converted
13148 * flags bitmask of format flags; only F_ALT is looked at
13149 * prec minimum number of digits; 0-fill on left if needed
13150 * type a character in [duoxX]; u acts the same as d
13151 *
13152 * CAUTION: o, x and X conversions on regular ints can never
13153 * produce a '-' sign, but can for Python's unbounded ints.
13154 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013155static PyObject*
13156formatlong(PyObject *val, int flags, int prec, int type)
13157{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013158 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013159 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013160 Py_ssize_t i;
13161 int sign; /* 1 if '-', else 0 */
13162 int len; /* number of characters */
13163 Py_ssize_t llen;
13164 int numdigits; /* len == numnondigits + numdigits */
13165 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013166
Victor Stinnerd0880d52012-04-27 23:40:13 +020013167 /* Avoid exceeding SSIZE_T_MAX */
13168 if (prec > INT_MAX-3) {
13169 PyErr_SetString(PyExc_OverflowError,
13170 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013171 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013172 }
13173
13174 assert(PyLong_Check(val));
13175
13176 switch (type) {
13177 case 'd':
13178 case 'u':
13179 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013180 if (PyBool_Check(val))
13181 result = PyNumber_ToBase(val, 10);
13182 else
13183 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013184 break;
13185 case 'o':
13186 numnondigits = 2;
13187 result = PyNumber_ToBase(val, 8);
13188 break;
13189 case 'x':
13190 case 'X':
13191 numnondigits = 2;
13192 result = PyNumber_ToBase(val, 16);
13193 break;
13194 default:
13195 assert(!"'type' not in [duoxX]");
13196 }
13197 if (!result)
13198 return NULL;
13199
13200 assert(unicode_modifiable(result));
13201 assert(PyUnicode_IS_READY(result));
13202 assert(PyUnicode_IS_ASCII(result));
13203
13204 /* To modify the string in-place, there can only be one reference. */
13205 if (Py_REFCNT(result) != 1) {
13206 PyErr_BadInternalCall();
13207 return NULL;
13208 }
13209 buf = PyUnicode_DATA(result);
13210 llen = PyUnicode_GET_LENGTH(result);
13211 if (llen > INT_MAX) {
13212 PyErr_SetString(PyExc_ValueError,
13213 "string too large in _PyBytes_FormatLong");
13214 return NULL;
13215 }
13216 len = (int)llen;
13217 sign = buf[0] == '-';
13218 numnondigits += sign;
13219 numdigits = len - numnondigits;
13220 assert(numdigits > 0);
13221
13222 /* Get rid of base marker unless F_ALT */
13223 if (((flags & F_ALT) == 0 &&
13224 (type == 'o' || type == 'x' || type == 'X'))) {
13225 assert(buf[sign] == '0');
13226 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13227 buf[sign+1] == 'o');
13228 numnondigits -= 2;
13229 buf += 2;
13230 len -= 2;
13231 if (sign)
13232 buf[0] = '-';
13233 assert(len == numnondigits + numdigits);
13234 assert(numdigits > 0);
13235 }
13236
13237 /* Fill with leading zeroes to meet minimum width. */
13238 if (prec > numdigits) {
13239 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13240 numnondigits + prec);
13241 char *b1;
13242 if (!r1) {
13243 Py_DECREF(result);
13244 return NULL;
13245 }
13246 b1 = PyBytes_AS_STRING(r1);
13247 for (i = 0; i < numnondigits; ++i)
13248 *b1++ = *buf++;
13249 for (i = 0; i < prec - numdigits; i++)
13250 *b1++ = '0';
13251 for (i = 0; i < numdigits; i++)
13252 *b1++ = *buf++;
13253 *b1 = '\0';
13254 Py_DECREF(result);
13255 result = r1;
13256 buf = PyBytes_AS_STRING(result);
13257 len = numnondigits + prec;
13258 }
13259
13260 /* Fix up case for hex conversions. */
13261 if (type == 'X') {
13262 /* Need to convert all lower case letters to upper case.
13263 and need to convert 0x to 0X (and -0x to -0X). */
13264 for (i = 0; i < len; i++)
13265 if (buf[i] >= 'a' && buf[i] <= 'x')
13266 buf[i] -= 'a'-'A';
13267 }
13268 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13269 PyObject *unicode;
13270 unicode = unicode_fromascii((unsigned char *)buf, len);
13271 Py_DECREF(result);
13272 result = unicode;
13273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013274 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013275}
13276
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013277static Py_UCS4
13278formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013280 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013281 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013282 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013283 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013284 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013285 goto onError;
13286 }
13287 else {
13288 /* Integer input truncated to a character */
13289 long x;
13290 x = PyLong_AsLong(v);
13291 if (x == -1 && PyErr_Occurred())
13292 goto onError;
13293
Victor Stinner8faf8212011-12-08 22:14:11 +010013294 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013295 PyErr_SetString(PyExc_OverflowError,
13296 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013297 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 }
13299
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013300 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013302
Benjamin Peterson29060642009-01-31 22:14:21 +000013303 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013304 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013306 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013307}
13308
Alexander Belopolsky40018472011-02-26 01:02:56 +000013309PyObject *
13310PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013311{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013312 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013315 PyObject *temp = NULL;
13316 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013317 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013318 void *fmt;
13319 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013320 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013321 Py_ssize_t sublen;
13322 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013323
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 PyErr_BadInternalCall();
13326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013328 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013329 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013331 if (PyUnicode_READY(uformat) == -1)
13332 Py_DECREF(uformat);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 fmt = PyUnicode_DATA(uformat);
13335 fmtkind = PyUnicode_KIND(uformat);
13336 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13337 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013339 if (_PyUnicodeWriter_Init(&writer, fmtcnt + 100, 127) < 0)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013340 goto onError;
13341
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 arglen = PyTuple_Size(args);
13344 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013345 }
13346 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013347 arglen = -1;
13348 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013350 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013351 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013352 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353
13354 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013356 Py_ssize_t nonfmtpos;
13357 nonfmtpos = fmtpos++;
13358 while (fmtcnt >= 0 &&
13359 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13360 fmtpos++;
13361 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013362 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013363 if (fmtcnt < 0)
13364 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013365 sublen = fmtpos - nonfmtpos;
13366 maxchar = _PyUnicode_FindMaxChar(uformat,
13367 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013368 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013369 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013370
13371 copy_characters(writer.buffer, writer.pos,
13372 uformat, nonfmtpos, sublen);
13373 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013374 }
13375 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 /* Got a format specifier */
13377 int flags = 0;
13378 Py_ssize_t width = -1;
13379 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013380 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013381 Py_UCS4 fill;
13382 int sign;
13383 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 int isnumok;
13385 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013386 void *pbuf = NULL;
13387 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013388 Py_UCS4 bufmaxchar;
13389 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013392 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13393 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013394 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 Py_ssize_t keylen;
13396 PyObject *key;
13397 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013398
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 if (dict == NULL) {
13400 PyErr_SetString(PyExc_TypeError,
13401 "format requires a mapping");
13402 goto onError;
13403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013404 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 /* Skip over balanced parentheses */
13408 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013409 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13410 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013412 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013414 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 if (fmtcnt < 0 || pcount > 0) {
13418 PyErr_SetString(PyExc_ValueError,
13419 "incomplete format key");
13420 goto onError;
13421 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013422 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013423 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 if (key == NULL)
13425 goto onError;
13426 if (args_owned) {
13427 Py_DECREF(args);
13428 args_owned = 0;
13429 }
13430 args = PyObject_GetItem(dict, key);
13431 Py_DECREF(key);
13432 if (args == NULL) {
13433 goto onError;
13434 }
13435 args_owned = 1;
13436 arglen = -1;
13437 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013440 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13441 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 case '-': flags |= F_LJUST; continue;
13443 case '+': flags |= F_SIGN; continue;
13444 case ' ': flags |= F_BLANK; continue;
13445 case '#': flags |= F_ALT; continue;
13446 case '0': flags |= F_ZERO; continue;
13447 }
13448 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013449 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 if (c == '*') {
13451 v = getnextarg(args, arglen, &argidx);
13452 if (v == NULL)
13453 goto onError;
13454 if (!PyLong_Check(v)) {
13455 PyErr_SetString(PyExc_TypeError,
13456 "* wants int");
13457 goto onError;
13458 }
13459 width = PyLong_AsLong(v);
13460 if (width == -1 && PyErr_Occurred())
13461 goto onError;
13462 if (width < 0) {
13463 flags |= F_LJUST;
13464 width = -width;
13465 }
13466 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 }
13469 else if (c >= '0' && c <= '9') {
13470 width = c - '0';
13471 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 if (c < '0' || c > '9')
13474 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013475 /* Since c is unsigned, the RHS would end up as unsigned,
13476 mixing signed and unsigned comparison. Since c is between
13477 '0' and '9', casting to int is safe. */
13478 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013479 PyErr_SetString(PyExc_ValueError,
13480 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013481 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013482 }
13483 width = width*10 + (c - '0');
13484 }
13485 }
13486 if (c == '.') {
13487 prec = 0;
13488 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 if (c == '*') {
13491 v = getnextarg(args, arglen, &argidx);
13492 if (v == NULL)
13493 goto onError;
13494 if (!PyLong_Check(v)) {
13495 PyErr_SetString(PyExc_TypeError,
13496 "* wants int");
13497 goto onError;
13498 }
13499 prec = PyLong_AsLong(v);
13500 if (prec == -1 && PyErr_Occurred())
13501 goto onError;
13502 if (prec < 0)
13503 prec = 0;
13504 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 }
13507 else if (c >= '0' && c <= '9') {
13508 prec = c - '0';
13509 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013510 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 if (c < '0' || c > '9')
13512 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013513 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013514 PyErr_SetString(PyExc_ValueError,
13515 "prec too big");
13516 goto onError;
13517 }
13518 prec = prec*10 + (c - '0');
13519 }
13520 }
13521 } /* prec */
13522 if (fmtcnt >= 0) {
13523 if (c == 'h' || c == 'l' || c == 'L') {
13524 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013525 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013526 }
13527 }
13528 if (fmtcnt < 0) {
13529 PyErr_SetString(PyExc_ValueError,
13530 "incomplete format");
13531 goto onError;
13532 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013533
13534 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013535 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013536 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013537 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13538 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013539 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013541
13542
13543 v = getnextarg(args, arglen, &argidx);
13544 if (v == NULL)
13545 goto onError;
13546
Benjamin Peterson29060642009-01-31 22:14:21 +000013547 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013548 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013549 fill = ' ';
13550 switch (c) {
13551
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 case 's':
13553 case 'r':
13554 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013555 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 temp = v;
13557 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013558 }
13559 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 if (c == 's')
13561 temp = PyObject_Str(v);
13562 else if (c == 'r')
13563 temp = PyObject_Repr(v);
13564 else
13565 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 break;
13568
13569 case 'i':
13570 case 'd':
13571 case 'u':
13572 case 'o':
13573 case 'x':
13574 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 isnumok = 0;
13576 if (PyNumber_Check(v)) {
13577 PyObject *iobj=NULL;
13578
13579 if (PyLong_Check(v)) {
13580 iobj = v;
13581 Py_INCREF(iobj);
13582 }
13583 else {
13584 iobj = PyNumber_Long(v);
13585 }
13586 if (iobj!=NULL) {
13587 if (PyLong_Check(iobj)) {
13588 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013589 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013590 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 }
13593 else {
13594 Py_DECREF(iobj);
13595 }
13596 }
13597 }
13598 if (!isnumok) {
13599 PyErr_Format(PyExc_TypeError,
13600 "%%%c format: a number is required, "
13601 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13602 goto onError;
13603 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013604 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 fill = '0';
13606 break;
13607
13608 case 'e':
13609 case 'E':
13610 case 'f':
13611 case 'F':
13612 case 'g':
13613 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013615 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013616 fill = '0';
Victor Stinneraff3cc62012-04-30 05:19:21 +020013617 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 break;
13619
13620 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013621 {
13622 Py_UCS4 ch = formatchar(v);
13623 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013625 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013627 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013628
13629 default:
13630 PyErr_Format(PyExc_ValueError,
13631 "unsupported format character '%c' (0x%x) "
13632 "at index %zd",
13633 (31<=c && c<=126) ? (char)c : '?',
13634 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013635 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 goto onError;
13637 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013638 if (temp == NULL)
13639 goto onError;
13640 assert (PyUnicode_Check(temp));
13641 if (PyUnicode_READY(temp) == -1) {
13642 Py_CLEAR(temp);
13643 goto onError;
13644 }
13645 kind = PyUnicode_KIND(temp);
13646 pbuf = PyUnicode_DATA(temp);
13647 len = PyUnicode_GET_LENGTH(temp);
13648
13649 if (c == 's' || c == 'r' || c == 'a') {
13650 if (prec >= 0 && len > prec)
13651 len = prec;
13652 }
13653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 /* pbuf is initialized here. */
13655 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013656 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013657 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13658 if (ch == '-' || ch == '+') {
13659 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013660 len--;
13661 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013662 }
13663 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013664 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013665 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013666 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 else
13668 sign = 0;
13669 }
13670 if (width < len)
13671 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013672
13673 /* Compute the length and maximum character of the
13674 written characters */
13675 bufmaxchar = 127;
13676 if (!(flags & F_LJUST)) {
13677 if (sign) {
13678 if ((width-1) > len)
13679 bufmaxchar = Py_MAX(bufmaxchar, fill);
13680 }
13681 else {
13682 if (width > len)
13683 bufmaxchar = Py_MAX(bufmaxchar, fill);
13684 }
13685 }
13686 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
13687 bufmaxchar = Py_MAX(bufmaxchar, maxchar);
13688
13689 buflen = width;
13690 if (sign && len == width)
13691 buflen++;
13692
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013693 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013694 goto onError;
13695
13696 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013698 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013699 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13700 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013701 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013702 if (width > len)
13703 width--;
13704 }
13705 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013706 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013707 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013709 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13710 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13711 writer.pos += 2;
13712 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 width -= 2;
13715 if (width < 0)
13716 width = 0;
13717 len -= 2;
13718 }
13719 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013720 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013721 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13722 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013723 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 }
13725 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013726 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013727 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13728 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013729 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13732 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013733 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13734 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13735 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013736 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013737 }
13738 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013739
Victor Stinneree4544c2012-05-09 22:24:08 +020013740 copy_characters(writer.buffer, writer.pos,
13741 temp, pindex, len);
13742 writer.pos += len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013743 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013744 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013745 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13746 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013747 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013748
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 if (dict && (argidx < arglen) && c != '%') {
13750 PyErr_SetString(PyExc_TypeError,
13751 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013752 goto onError;
13753 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013754 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756 } /* until end */
13757 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 PyErr_SetString(PyExc_TypeError,
13759 "not all arguments converted during string formatting");
13760 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013761 }
13762
13763 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013764 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765 }
13766 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013767 Py_XDECREF(temp);
13768 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013769 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013770
Benjamin Peterson29060642009-01-31 22:14:21 +000013771 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013773 Py_XDECREF(temp);
13774 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013775 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013776 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778 }
13779 return NULL;
13780}
13781
Jeremy Hylton938ace62002-07-17 16:30:39 +000013782static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013783unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13784
Tim Peters6d6c1a32001-08-02 04:15:00 +000013785static PyObject *
13786unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13787{
Benjamin Peterson29060642009-01-31 22:14:21 +000013788 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013789 static char *kwlist[] = {"object", "encoding", "errors", 0};
13790 char *encoding = NULL;
13791 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013792
Benjamin Peterson14339b62009-01-31 16:36:08 +000013793 if (type != &PyUnicode_Type)
13794 return unicode_subtype_new(type, args, kwds);
13795 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013796 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013797 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013798 if (x == NULL) {
13799 Py_INCREF(unicode_empty);
13800 return unicode_empty;
13801 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013802 if (encoding == NULL && errors == NULL)
13803 return PyObject_Str(x);
13804 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013806}
13807
/* Create an instance of a str subclass.

   The value is first built as an exact str by calling unicode_new() with
   &PyUnicode_Type; an object of the requested `type` is then allocated
   with tp_alloc and the character data is copied into a separately
   allocated buffer owned by the new object.

   Returns a new reference, or NULL with an exception set when building
   the temporary str, readying it, or allocating memory fails. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *unicode, *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    /* Build the value as an exact str first. */
    unicode = unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        Py_DECREF(unicode);
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    /* Initialise every header field before the data buffer exists; the
       data pointer stays NULL until the allocation below succeeds. */
    _PyUnicode_LENGTH(self) = length;
#ifdef Py_DEBUG
    /* Debug builds start with hash -1 and copy the real hash only after
       the consistency assertion near the end of this function. */
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Decide the per-character size and whether the canonical buffer can
       also serve as the UTF-8 or wchar_t representation. */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        /* All characters below 128: the same bytes are used as UTF-8. */
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Room for length + 1 characters (presumably the extra slot holds the
       terminating null of the source buffer -- confirm). */
    data = PyObject_MALLOC((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* NOTE(review): the copy size is computed from `kind` while the
       allocation uses `char_size`; the branches above pair the 1/2/4-byte
       kinds with char_size 1/2/4, so this assumes the kind constants
       equal the character width in bytes. */
    Py_MEMCPY(data, PyUnicode_DATA(unicode),
              kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    Py_DECREF(unicode);
    return self;

onError:
    /* Drop both the temporary str and the half-initialised instance. */
    Py_DECREF(unicode);
    Py_DECREF(self);
    return NULL;
}
13907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013908PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013910\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013911Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013912encoding defaults to the current default string encoding.\n\
13913errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013914
/* Forward declaration: the iterator factory is defined further down but
   is referenced by the tp_iter slot of the type object below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in str type.  Instances are created through
   unicode_new(); the type allows subclassing (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
        Py_TPFLAGS_UNICODE_SUBCLASS,         /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,               /* tp_free */
};
13960
13961/* Initialize the Unicode implementation */
13962
Victor Stinner3a50e702011-10-18 21:21:00 +020013963int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013964{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013965 int i;
13966
Thomas Wouters477c8d52006-05-27 19:21:47 +000013967 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013968 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013969 0x000A, /* LINE FEED */
13970 0x000D, /* CARRIAGE RETURN */
13971 0x001C, /* FILE SEPARATOR */
13972 0x001D, /* GROUP SEPARATOR */
13973 0x001E, /* RECORD SEPARATOR */
13974 0x0085, /* NEXT LINE */
13975 0x2028, /* LINE SEPARATOR */
13976 0x2029, /* PARAGRAPH SEPARATOR */
13977 };
13978
Fred Drakee4315f52000-05-09 19:53:39 +000013979 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013980 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013981 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013982 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013983 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013984
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013985 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013986 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013987 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013988 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013989
13990 /* initialize the linebreak bloom filter */
13991 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013992 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013993 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013994
13995 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013996
13997#ifdef HAVE_MBCS
13998 winver.dwOSVersionInfoSize = sizeof(winver);
13999 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14000 PyErr_SetFromWindowsErr(0);
14001 return -1;
14002 }
14003#endif
14004 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005}
14006
14007/* Finalize the Unicode implementation */
14008
/* Free-list clearing hook for str objects.  There is nothing to release
   here: the function performs no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14014
Guido van Rossumd57fd912000-03-10 22:53:23 +000014015void
Thomas Wouters78890102000-07-22 19:25:51 +000014016_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014017{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014018 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014019
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014020 Py_XDECREF(unicode_empty);
14021 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014022
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014023 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014024 if (unicode_latin1[i]) {
14025 Py_DECREF(unicode_latin1[i]);
14026 unicode_latin1[i] = NULL;
14027 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014028 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014029 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014030 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014031}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014032
/* Intern the string *p in place.

   If an equal string is already stored in the `interned` dictionary, *p
   is replaced by a new reference to that string and the reference to the
   original is dropped.  Otherwise *p itself is stored in the dictionary
   and marked SSTATE_INTERNED_MORTAL.

   The function never reports an error: on any failure (NULL or non-str
   argument in non-debug builds, str subclass, dictionary creation or
   insertion failure) it clears the exception, if any, and leaves *p
   untouched. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    /* Already interned: nothing to do. */
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dictionary of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand it out instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The recursion_critical flag is raised around the insertion and
       reset on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
14084
14085void
14086PyUnicode_InternImmortal(PyObject **p)
14087{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014088 PyUnicode_InternInPlace(p);
14089 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014090 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 Py_INCREF(*p);
14092 }
Walter Dörwald16807132007-05-25 13:52:07 +000014093}
14094
14095PyObject *
14096PyUnicode_InternFromString(const char *cp)
14097{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014098 PyObject *s = PyUnicode_FromString(cp);
14099 if (s == NULL)
14100 return NULL;
14101 PyUnicode_InternInPlace(&s);
14102 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014103}
14104
/* Undo string interning.

   Every key of the `interned` dictionary gets back the references that
   interning removed from its reference count and is marked as not
   interned; the dictionary is then cleared, released and reset to NULL.
   Statistics about the released strings are written to stderr.

   Does nothing when no interned dictionary exists.  If the key list
   cannot be obtained, the pending exception is cleared and the function
   returns without changing anything. */
void
_Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        /* A failure to ready the string is only reported (and asserted in
           debug builds); processing of the string continues. */
        if (PyUnicode_READY(s) == -1) {
            assert(0 && "could not ready string");
            fprintf(stderr, "could not ready string\n");
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* Immortal strings get one reference back ... */
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* ... mortal strings get both dictionary references back. */
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_DECREF(interned);
    interned = NULL;
}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014160
14161
14162/********************* Unicode Iterator **************************/
14163
/* State of a str iterator: the string being iterated and the position of
   the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14169
14170static void
14171unicodeiter_dealloc(unicodeiterobject *it)
14172{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 _PyObject_GC_UNTRACK(it);
14174 Py_XDECREF(it->it_seq);
14175 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014176}
14177
14178static int
14179unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014181 Py_VISIT(it->it_seq);
14182 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014183}
14184
14185static PyObject *
14186unicodeiter_next(unicodeiterobject *it)
14187{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014188 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014189
Benjamin Peterson14339b62009-01-31 16:36:08 +000014190 assert(it != NULL);
14191 seq = it->it_seq;
14192 if (seq == NULL)
14193 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014194 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014196 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14197 int kind = PyUnicode_KIND(seq);
14198 void *data = PyUnicode_DATA(seq);
14199 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14200 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 if (item != NULL)
14202 ++it->it_index;
14203 return item;
14204 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014205
Benjamin Peterson14339b62009-01-31 16:36:08 +000014206 Py_DECREF(seq);
14207 it->it_seq = NULL;
14208 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014209}
14210
14211static PyObject *
14212unicodeiter_len(unicodeiterobject *it)
14213{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014214 Py_ssize_t len = 0;
14215 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014216 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014218}
14219
14220PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14221
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014222static PyObject *
14223unicodeiter_reduce(unicodeiterobject *it)
14224{
14225 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014226 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014227 it->it_seq, it->it_index);
14228 } else {
14229 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14230 if (u == NULL)
14231 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014232 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014233 }
14234}
14235
14236PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14237
14238static PyObject *
14239unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14240{
14241 Py_ssize_t index = PyLong_AsSsize_t(state);
14242 if (index == -1 && PyErr_Occurred())
14243 return NULL;
14244 if (index < 0)
14245 index = 0;
14246 it->it_index = index;
14247 Py_RETURN_NONE;
14248}
14249
14250PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14251
/* Methods of the str iterator: the length hint plus the two pickling
   hooks (__reduce__ takes no argument, __setstate__ takes exactly one). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
14261
/* Type object of the iterator returned by unicode_iter().  Instances
   take part in garbage collection (Py_TPFLAGS_HAVE_GC) and are their own
   iterator (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
14294
14295static PyObject *
14296unicode_iter(PyObject *seq)
14297{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014298 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014299
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 if (!PyUnicode_Check(seq)) {
14301 PyErr_BadInternalCall();
14302 return NULL;
14303 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014304 if (PyUnicode_READY(seq) == -1)
14305 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14307 if (it == NULL)
14308 return NULL;
14309 it->it_index = 0;
14310 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014311 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014312 _PyObject_GC_TRACK(it);
14313 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014314}
14315
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014316
14317size_t
14318Py_UNICODE_strlen(const Py_UNICODE *u)
14319{
14320 int res = 0;
14321 while(*u++)
14322 res++;
14323 return res;
14324}
14325
14326Py_UNICODE*
14327Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14328{
14329 Py_UNICODE *u = s1;
14330 while ((*u++ = *s2++));
14331 return s1;
14332}
14333
14334Py_UNICODE*
14335Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14336{
14337 Py_UNICODE *u = s1;
14338 while ((*u++ = *s2++))
14339 if (n-- == 0)
14340 break;
14341 return s1;
14342}
14343
14344Py_UNICODE*
14345Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14346{
14347 Py_UNICODE *u1 = s1;
14348 u1 += Py_UNICODE_strlen(u1);
14349 Py_UNICODE_strcpy(u1, s2);
14350 return s1;
14351}
14352
14353int
14354Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14355{
14356 while (*s1 && *s2 && *s1 == *s2)
14357 s1++, s2++;
14358 if (*s1 && *s2)
14359 return (*s1 < *s2) ? -1 : +1;
14360 if (*s1)
14361 return 1;
14362 if (*s2)
14363 return -1;
14364 return 0;
14365}
14366
14367int
14368Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14369{
14370 register Py_UNICODE u1, u2;
14371 for (; n != 0; n--) {
14372 u1 = *s1;
14373 u2 = *s2;
14374 if (u1 != u2)
14375 return (u1 < u2) ? -1 : +1;
14376 if (u1 == '\0')
14377 return 0;
14378 s1++;
14379 s2++;
14380 }
14381 return 0;
14382}
14383
14384Py_UNICODE*
14385Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14386{
14387 const Py_UNICODE *p;
14388 for (p = s; *p; p++)
14389 if (*p == c)
14390 return (Py_UNICODE*)p;
14391 return NULL;
14392}
14393
14394Py_UNICODE*
14395Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14396{
14397 const Py_UNICODE *p;
14398 p = s + Py_UNICODE_strlen(s);
14399 while (p != s) {
14400 p--;
14401 if (*p == c)
14402 return (Py_UNICODE*)p;
14403 }
14404 return NULL;
14405}
Victor Stinner331ea922010-08-10 16:37:20 +000014406
Victor Stinner71133ff2010-09-01 23:43:53 +000014407Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014408PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014409{
Victor Stinner577db2c2011-10-11 22:12:48 +020014410 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014411 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014413 if (!PyUnicode_Check(unicode)) {
14414 PyErr_BadArgument();
14415 return NULL;
14416 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014417 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014418 if (u == NULL)
14419 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014420 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014421 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014422 PyErr_NoMemory();
14423 return NULL;
14424 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014425 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014426 size *= sizeof(Py_UNICODE);
14427 copy = PyMem_Malloc(size);
14428 if (copy == NULL) {
14429 PyErr_NoMemory();
14430 return NULL;
14431 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014432 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014433 return copy;
14434}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014435
Georg Brandl66c221e2010-10-14 07:04:07 +000014436/* A _string module, to export formatter_parser and formatter_field_name_split
14437 to the string.Formatter class implemented in Python. */
14438
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
14446
/* Definition of the _string module: its name, docstring and method
   table.  The remaining entries are left at 0/NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* module name */
    PyDoc_STR("string helper module"),      /* module docstring */
    0,
    _string_methods,                        /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
14458
14459PyMODINIT_FUNC
14460PyInit__string(void)
14461{
14462 return PyModule_Create(&_string_module);
14463}
14464
14465
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014466#ifdef __cplusplus
14467}
14468#endif