blob: 0e7493bb61492f21cc4467734df083f956718552 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used by the accessor macros below.  In debug builds it runs
   the full structural consistency check (without scanning the content);
   otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Raw field accessors.  The variants with a leading underscore expand to
   the struct member itself (usable as an lvalue) and, unless noted, perform
   no checks.  The variants without the underscore assert that the object is
   a ready unicode object before reading. */

/* utf8 member of the compact layout (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer: for compact ASCII objects this is the memory located
   immediately after the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* utf8_length member of the compact layout (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length: for compact ASCII objects this is the string length,
   otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting that op is a unicode object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting that op is a unicode object. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any member of the non-compact layout (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The bitwise OR is never smaller than either operand, so the result is an
   upper bound of both values rather than necessarily their exact maximum. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
118
/* File-local replacement for the public PyUnicode_READY(): evaluates to 0
   when the object is already ready, otherwise to the result of
   _PyUnicode_Ready(op).  Asserts that op is a unicode object first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* true if the utf8 pointer aliases the canonical character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer aliases the canonical character data.
   The macro argument is used consistently: the expansion no longer refers
   to a variable that merely happens to be called "unicode" at the call
   site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is set and either the object is not
   ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once.  "to" is parenthesized before
   being cast so that an expression argument (e.g. "buf + offset") is
   converted as a whole instead of having only its first operand cast.
   The main loop copies four elements per iteration; the trailing loop
   handles the remaining 0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by ASCII code: 1 for whitespace, else 0.
   Each row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200228static void copy_characters(
229 PyObject *to, Py_ssize_t to_start,
230 PyObject *from, Py_ssize_t from_start,
231 Py_ssize_t how_many);
Victor Stinner488fa492011-12-12 00:01:39 +0100232static int unicode_modifiable(PyObject *unicode);
233
Victor Stinnerfe226c02011-10-03 03:52:20 +0200234
Alexander Belopolsky40018472011-02-26 01:02:56 +0000235static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200236unicode_fromascii(const unsigned char *s, Py_ssize_t size);
237static PyObject *
238_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
239static PyObject *
240_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
241static PyObject *
242_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
243
244static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000246 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100247 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000248 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
249
Alexander Belopolsky40018472011-02-26 01:02:56 +0000250static void
251raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300252 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100253 PyObject *unicode,
254 Py_ssize_t startpos, Py_ssize_t endpos,
255 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000256
/* Same for linebreaks: 128-entry lookup table indexed by ASCII code,
   1 for line-break characters, else 0.  The table is only ever read, so it
   is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
284
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300285/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
286 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000288PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000289{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000290#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000291 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000292#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000293 /* This is actually an illegal character, so it should
294 not be passed to unichr. */
295 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000296#endif
297}
298
#ifdef Py_DEBUG
/* Debug-only structural check of a unicode object.

   Every invariant is verified with assert(); the function itself always
   returns 1 so that it can be wrapped in assert() by callers.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the object is not in the legacy
                  wchar_t-only state), additionally scan the characters to
                  verify that the narrowest possible kind is used and that
                  the buffer is terminated by a 0 character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy object that only has a wstr buffer so far */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy object that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the character width
               of the kind equals sizeof(wchar_t) */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be the terminator */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415static PyObject*
416unicode_result_wchar(PyObject *unicode)
417{
418#ifndef Py_DEBUG
419 Py_ssize_t len;
420
421 assert(Py_REFCNT(unicode) == 1);
422
423 len = _PyUnicode_WSTR_LENGTH(unicode);
424 if (len == 0) {
425 Py_INCREF(unicode_empty);
426 Py_DECREF(unicode);
427 return unicode_empty;
428 }
429
430 if (len == 1) {
431 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
432 if (ch < 256) {
433 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
434 Py_DECREF(unicode);
435 return latin1_char;
436 }
437 }
438
439 if (_PyUnicode_Ready(unicode) < 0) {
440 Py_XDECREF(unicode);
441 return NULL;
442 }
443#else
444 /* don't make the result ready in debug mode to ensure that the caller
445 makes the string ready before using it */
446 assert(_PyUnicode_CheckConsistency(unicode, 1));
447#endif
448 return unicode;
449}
450
451static PyObject*
452unicode_result_ready(PyObject *unicode)
453{
454 Py_ssize_t length;
455
456 length = PyUnicode_GET_LENGTH(unicode);
457 if (length == 0) {
458 if (unicode != unicode_empty) {
459 Py_INCREF(unicode_empty);
460 Py_DECREF(unicode);
461 }
462 return unicode_empty;
463 }
464
465 if (length == 1) {
466 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
467 if (ch < 256) {
468 PyObject *latin1_char = unicode_latin1[ch];
469 if (latin1_char != NULL) {
470 if (unicode != latin1_char) {
471 Py_INCREF(latin1_char);
472 Py_DECREF(unicode);
473 }
474 return latin1_char;
475 }
476 else {
477 assert(_PyUnicode_CheckConsistency(unicode, 1));
478 Py_INCREF(unicode);
479 unicode_latin1[ch] = unicode;
480 return unicode;
481 }
482 }
483 }
484
485 assert(_PyUnicode_CheckConsistency(unicode, 1));
486 return unicode;
487}
488
489static PyObject*
490unicode_result(PyObject *unicode)
491{
492 assert(_PyUnicode_CHECK(unicode));
493 if (PyUnicode_IS_READY(unicode))
494 return unicode_result_ready(unicode);
495 else
496 return unicode_result_wchar(unicode);
497}
498
Victor Stinnerc4b49542011-12-11 22:44:26 +0100499static PyObject*
500unicode_result_unchanged(PyObject *unicode)
501{
502 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500503 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100504 return NULL;
505 Py_INCREF(unicode);
506 return unicode;
507 }
508 else
509 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100510 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100511}
512
Victor Stinner3a50e702011-10-18 21:21:00 +0200513#ifdef HAVE_MBCS
514static OSVERSIONINFOEX winver;
515#endif
516
Thomas Wouters477c8d52006-05-27 19:21:47 +0000517/* --- Bloom Filters ----------------------------------------------------- */
518
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   Unicode character as the bit index. */

/* The linebreak mask is set up during Unicode initialization
   (_PyUnicode_Init(), below). */
524
Antoine Pitrouf068f942010-01-13 14:19:12 +0000525#if LONG_BIT >= 128
526#define BLOOM_WIDTH 128
527#elif LONG_BIT >= 64
528#define BLOOM_WIDTH 64
529#elif LONG_BIT >= 32
530#define BLOOM_WIDTH 32
531#else
532#error "LONG_BIT is smaller than 32"
533#endif
534
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535#define BLOOM_MASK unsigned long
536
537static BLOOM_MASK bloom_linebreak;
538
/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low bits of ch
   in mask.  Both arguments are parenthesized so that expression arguments
   (e.g. "a | b") are combined as a whole with the bit test. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line-break test: table lookup for ASCII, otherwise the bloom mask is
   used as a cheap pre-filter before the full Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Alexander Belopolsky40018472011-02-26 01:02:56 +0000546Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548{
549 /* calculate simple bloom-style bitmask for a given unicode string */
550
Antoine Pitrouf068f942010-01-13 14:19:12 +0000551 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000552 Py_ssize_t i;
553
554 mask = 0;
555 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200556 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000557
558 return mask;
559}
560
/* true if chr occurs in str: the bloom mask is tested first and
   PyUnicode_FindChar() is only called when the mask bit is set. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000564
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200565/* Compilation of templated routines */
566
567#include "stringlib/asciilib.h"
568#include "stringlib/fastsearch.h"
569#include "stringlib/partition.h"
570#include "stringlib/split.h"
571#include "stringlib/count.h"
572#include "stringlib/find.h"
573#include "stringlib/find_max_char.h"
574#include "stringlib/localeutil.h"
575#include "stringlib/undef.h"
576
577#include "stringlib/ucs1lib.h"
578#include "stringlib/fastsearch.h"
579#include "stringlib/partition.h"
580#include "stringlib/split.h"
581#include "stringlib/count.h"
582#include "stringlib/find.h"
583#include "stringlib/find_max_char.h"
584#include "stringlib/localeutil.h"
585#include "stringlib/undef.h"
586
587#include "stringlib/ucs2lib.h"
588#include "stringlib/fastsearch.h"
589#include "stringlib/partition.h"
590#include "stringlib/split.h"
591#include "stringlib/count.h"
592#include "stringlib/find.h"
593#include "stringlib/find_max_char.h"
594#include "stringlib/localeutil.h"
595#include "stringlib/undef.h"
596
597#include "stringlib/ucs4lib.h"
598#include "stringlib/fastsearch.h"
599#include "stringlib/partition.h"
600#include "stringlib/split.h"
601#include "stringlib/count.h"
602#include "stringlib/find.h"
603#include "stringlib/find_max_char.h"
604#include "stringlib/localeutil.h"
605#include "stringlib/undef.h"
606
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200607#include "stringlib/unicodedefs.h"
608#include "stringlib/fastsearch.h"
609#include "stringlib/count.h"
610#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100611#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612
Guido van Rossumd57fd912000-03-10 22:53:23 +0000613/* --- Unicode Object ----------------------------------------------------- */
614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200616fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200617
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200618Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
619 Py_ssize_t size, Py_UCS4 ch,
620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200621{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200622 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
623
624 switch (kind) {
625 case PyUnicode_1BYTE_KIND:
626 {
627 Py_UCS1 ch1 = (Py_UCS1) ch;
628 if (ch1 == ch)
629 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
630 else
631 return -1;
632 }
633 case PyUnicode_2BYTE_KIND:
634 {
635 Py_UCS2 ch2 = (Py_UCS2) ch;
636 if (ch2 == ch)
637 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
638 else
639 return -1;
640 }
641 case PyUnicode_4BYTE_KIND:
642 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
643 default:
644 assert(0);
645 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200646 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647}
648
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649static PyObject*
650resize_compact(PyObject *unicode, Py_ssize_t length)
651{
652 Py_ssize_t char_size;
653 Py_ssize_t struct_size;
654 Py_ssize_t new_size;
655 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100656 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100658 assert(PyUnicode_IS_COMPACT(unicode));
659
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200660 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100661 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 struct_size = sizeof(PyASCIIObject);
663 else
664 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200666
Victor Stinnerfe226c02011-10-03 03:52:20 +0200667 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
668 PyErr_NoMemory();
669 return NULL;
670 }
671 new_size = (struct_size + (length + 1) * char_size);
672
Victor Stinner84def372011-12-11 20:04:56 +0100673 _Py_DEC_REFTOTAL;
674 _Py_ForgetReference(unicode);
675
676 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
677 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100678 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200679 PyErr_NoMemory();
680 return NULL;
681 }
Victor Stinner84def372011-12-11 20:04:56 +0100682 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200683 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200686 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200687 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100688 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200689 _PyUnicode_WSTR_LENGTH(unicode) = length;
690 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200691 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
692 length, 0);
693 return unicode;
694}
695
Alexander Belopolsky40018472011-02-26 01:02:56 +0000696static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200697resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000698{
Victor Stinner95663112011-10-04 01:03:50 +0200699 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100700 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000703
Victor Stinnerfe226c02011-10-03 03:52:20 +0200704 if (PyUnicode_IS_READY(unicode)) {
705 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200706 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200707 void *data;
708
709 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200710 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200711 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
712 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713
714 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
715 PyErr_NoMemory();
716 return -1;
717 }
718 new_size = (length + 1) * char_size;
719
Victor Stinner7a9105a2011-12-12 00:13:42 +0100720 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
721 {
722 PyObject_DEL(_PyUnicode_UTF8(unicode));
723 _PyUnicode_UTF8(unicode) = NULL;
724 _PyUnicode_UTF8_LENGTH(unicode) = 0;
725 }
726
Victor Stinnerfe226c02011-10-03 03:52:20 +0200727 data = (PyObject *)PyObject_REALLOC(data, new_size);
728 if (data == NULL) {
729 PyErr_NoMemory();
730 return -1;
731 }
732 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200733 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200734 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_WSTR_LENGTH(unicode) = length;
736 }
737 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200738 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200739 _PyUnicode_UTF8_LENGTH(unicode) = length;
740 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_LENGTH(unicode) = length;
742 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200743 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200744 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200745 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747 }
Victor Stinner95663112011-10-04 01:03:50 +0200748 assert(_PyUnicode_WSTR(unicode) != NULL);
749
750 /* check for integer overflow */
751 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
752 PyErr_NoMemory();
753 return -1;
754 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100755 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200756 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100757 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200758 if (!wstr) {
759 PyErr_NoMemory();
760 return -1;
761 }
762 _PyUnicode_WSTR(unicode) = wstr;
763 _PyUnicode_WSTR(unicode)[length] = 0;
764 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200765 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000766 return 0;
767}
768
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769static PyObject*
770resize_copy(PyObject *unicode, Py_ssize_t length)
771{
772 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100775
Benjamin Petersonbac79492012-01-14 13:34:47 -0500776 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100777 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
780 if (copy == NULL)
781 return NULL;
782
783 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200784 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200785 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200786 }
787 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200788 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100789
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200790 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200791 if (w == NULL)
792 return NULL;
793 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
794 copy_length = Py_MIN(copy_length, length);
795 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
796 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200797 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200798 }
799}
800
/* We allocate one extra character (not just one byte) so that the string
   is always U+0000 terminated; some code (e.g. new_identifier) relies on
   that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810#ifdef Py_DEBUG
/* Debug-build statistic: incremented by _PyUnicode_New() each time it goes
   on to allocate a new object. */
static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200812#endif
813
Alexander Belopolsky40018472011-02-26 01:02:56 +0000814static PyUnicodeObject *
815_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816{
817 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200818 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819
Thomas Wouters477c8d52006-05-27 19:21:47 +0000820 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 if (length == 0 && unicode_empty != NULL) {
822 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200823 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000824 }
825
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000826 /* Ensure we won't overflow the size. */
827 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
828 return (PyUnicodeObject *)PyErr_NoMemory();
829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200830 if (length < 0) {
831 PyErr_SetString(PyExc_SystemError,
832 "Negative size passed to _PyUnicode_New");
833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 }
835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200836#ifdef Py_DEBUG
837 ++unicode_old_new_calls;
838#endif
839
840 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
841 if (unicode == NULL)
842 return NULL;
843 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
844 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
845 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100846 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000847 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100848 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000849 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200850
Jeremy Hyltond8082792003-09-16 19:41:39 +0000851 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000852 * the caller fails before initializing str -- unicode_resize()
853 * reads str[0], and the Keep-Alive optimization can keep memory
854 * allocated for str alive across a call to unicode_dealloc(unicode).
855 * We don't want unicode_resize to read uninitialized memory in
856 * that case.
857 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200858 _PyUnicode_WSTR(unicode)[0] = 0;
859 _PyUnicode_WSTR(unicode)[length] = 0;
860 _PyUnicode_WSTR_LENGTH(unicode) = length;
861 _PyUnicode_HASH(unicode) = -1;
862 _PyUnicode_STATE(unicode).interned = 0;
863 _PyUnicode_STATE(unicode).kind = 0;
864 _PyUnicode_STATE(unicode).compact = 0;
865 _PyUnicode_STATE(unicode).ready = 0;
866 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200867 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200868 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200869 _PyUnicode_UTF8(unicode) = NULL;
870 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100871 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872 return unicode;
873}
874
Victor Stinnerf42dc442011-10-02 23:33:16 +0200875static const char*
876unicode_kind_name(PyObject *unicode)
877{
Victor Stinner42dfd712011-10-03 14:41:45 +0200878 /* don't check consistency: unicode_kind_name() is called from
879 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200880 if (!PyUnicode_IS_COMPACT(unicode))
881 {
882 if (!PyUnicode_IS_READY(unicode))
883 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600884 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200885 {
886 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200887 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200888 return "legacy ascii";
889 else
890 return "legacy latin1";
891 case PyUnicode_2BYTE_KIND:
892 return "legacy UCS2";
893 case PyUnicode_4BYTE_KIND:
894 return "legacy UCS4";
895 default:
896 return "<legacy invalid kind>";
897 }
898 }
899 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600900 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200901 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200902 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200903 return "ascii";
904 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200905 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200906 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 default:
911 return "<invalid compact kind>";
912 }
913}
914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915#ifdef Py_DEBUG
/* Debug-build statistic: incremented by PyUnicode_New() for every call
   that is not answered with the shared empty string. */
static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917
918/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
922
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
926void *_PyUnicode_data(void *unicode){
927 printf("obj %p\n", unicode);
928 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
929 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
930 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
931 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
932 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
933 return PyUnicode_DATA(unicode);
934}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200935
936void
937_PyUnicode_Dump(PyObject *op)
938{
939 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200940 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
941 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
942 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200943
Victor Stinnera849a4b2011-10-03 12:12:11 +0200944 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200945 {
946 if (ascii->state.ascii)
947 data = (ascii + 1);
948 else
949 data = (compact + 1);
950 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 else
952 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200953 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
954
Victor Stinnera849a4b2011-10-03 12:12:11 +0200955 if (ascii->wstr == data)
956 printf("shared ");
957 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200958
Victor Stinnera3b334d2011-10-03 13:53:37 +0200959 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 printf(" (%zu), ", compact->wstr_length);
961 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
962 printf("shared ");
963 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200964 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200966}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200967#endif
968
969PyObject *
970PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
971{
972 PyObject *obj;
973 PyCompactUnicodeObject *unicode;
974 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200975 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200976 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 Py_ssize_t char_size;
978 Py_ssize_t struct_size;
979
980 /* Optimization for empty strings */
981 if (size == 0 && unicode_empty != NULL) {
982 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200983 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 }
985
986#ifdef Py_DEBUG
987 ++unicode_new_new_calls;
988#endif
989
Victor Stinner9e9d6892011-10-04 01:02:02 +0200990 is_ascii = 0;
991 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992 struct_size = sizeof(PyCompactUnicodeObject);
993 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200994 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 char_size = 1;
996 is_ascii = 1;
997 struct_size = sizeof(PyASCIIObject);
998 }
999 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001000 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001 char_size = 1;
1002 }
1003 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001004 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001005 char_size = 2;
1006 if (sizeof(wchar_t) == 2)
1007 is_sharing = 1;
1008 }
1009 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001010 if (maxchar > MAX_UNICODE) {
1011 PyErr_SetString(PyExc_SystemError,
1012 "invalid maximum character passed to PyUnicode_New");
1013 return NULL;
1014 }
Victor Stinner8f825062012-04-27 13:55:39 +02001015 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001016 char_size = 4;
1017 if (sizeof(wchar_t) == 4)
1018 is_sharing = 1;
1019 }
1020
1021 /* Ensure we won't overflow the size. */
1022 if (size < 0) {
1023 PyErr_SetString(PyExc_SystemError,
1024 "Negative size passed to PyUnicode_New");
1025 return NULL;
1026 }
1027 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1028 return PyErr_NoMemory();
1029
1030 /* Duplicated allocation code from _PyObject_New() instead of a call to
1031 * PyObject_New() so we are able to allocate space for the object and
1032 * it's data buffer.
1033 */
1034 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1035 if (obj == NULL)
1036 return PyErr_NoMemory();
1037 obj = PyObject_INIT(obj, &PyUnicode_Type);
1038 if (obj == NULL)
1039 return NULL;
1040
1041 unicode = (PyCompactUnicodeObject *)obj;
1042 if (is_ascii)
1043 data = ((PyASCIIObject*)obj) + 1;
1044 else
1045 data = unicode + 1;
1046 _PyUnicode_LENGTH(unicode) = size;
1047 _PyUnicode_HASH(unicode) = -1;
1048 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 _PyUnicode_STATE(unicode).compact = 1;
1051 _PyUnicode_STATE(unicode).ready = 1;
1052 _PyUnicode_STATE(unicode).ascii = is_ascii;
1053 if (is_ascii) {
1054 ((char*)data)[size] = 0;
1055 _PyUnicode_WSTR(unicode) = NULL;
1056 }
Victor Stinner8f825062012-04-27 13:55:39 +02001057 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 ((char*)data)[size] = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001062 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 else {
1065 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001066 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001067 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001069 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 ((Py_UCS4*)data)[size] = 0;
1071 if (is_sharing) {
1072 _PyUnicode_WSTR_LENGTH(unicode) = size;
1073 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1074 }
1075 else {
1076 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1077 _PyUnicode_WSTR(unicode) = NULL;
1078 }
1079 }
Victor Stinner8f825062012-04-27 13:55:39 +02001080#ifdef Py_DEBUG
1081 /* Fill the data with invalid characters to detect bugs earlier.
1082 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1083 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1084 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1085 memset(data, 0xff, size * kind);
1086#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001087 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001088 return obj;
1089}
1090
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the
   result in the 4-byte data of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit, lone surrogates included, is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   `unicode` must be a 4-byte-kind string whose length equals the number
   of code points produced (asserted); the terminating null character is
   not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* ordinary unit or unpaired surrogate */
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1130
Victor Stinnercd9950f2011-10-02 00:34:53 +02001131static int
Victor Stinner488fa492011-12-12 00:01:39 +01001132unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001133{
Victor Stinner488fa492011-12-12 00:01:39 +01001134 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001135 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001136 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001137 return -1;
1138 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001139 return 0;
1140}
1141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142static int
1143_copy_characters(PyObject *to, Py_ssize_t to_start,
1144 PyObject *from, Py_ssize_t from_start,
1145 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 unsigned int from_kind, to_kind;
1148 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001149 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001151 assert(PyUnicode_Check(from));
1152 assert(PyUnicode_Check(to));
1153 assert(PyUnicode_IS_READY(from));
1154 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001155
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1157 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1158 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001160 if (how_many == 0)
1161 return 0;
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001168#ifdef Py_DEBUG
1169 if (!check_maxchar
1170 && (from_kind > to_kind
1171 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001172 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001173 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1174 Py_UCS4 ch;
1175 Py_ssize_t i;
1176 for (i=0; i < how_many; i++) {
1177 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1178 assert(ch <= to_maxchar);
1179 }
1180 }
1181#endif
1182 fast = (from_kind == to_kind);
1183 if (check_maxchar
1184 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1185 {
1186 /* deny latin1 => ascii */
1187 fast = 0;
1188 }
1189
1190 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001191 Py_MEMCPY((char*)to_data + to_kind * to_start,
1192 (char*)from_data + from_kind * from_start,
1193 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001195 else if (from_kind == PyUnicode_1BYTE_KIND
1196 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001197 {
1198 _PyUnicode_CONVERT_BYTES(
1199 Py_UCS1, Py_UCS2,
1200 PyUnicode_1BYTE_DATA(from) + from_start,
1201 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1202 PyUnicode_2BYTE_DATA(to) + to_start
1203 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001204 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001205 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 && to_kind == PyUnicode_4BYTE_KIND)
1207 {
1208 _PyUnicode_CONVERT_BYTES(
1209 Py_UCS1, Py_UCS4,
1210 PyUnicode_1BYTE_DATA(from) + from_start,
1211 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1212 PyUnicode_4BYTE_DATA(to) + to_start
1213 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001214 }
1215 else if (from_kind == PyUnicode_2BYTE_KIND
1216 && to_kind == PyUnicode_4BYTE_KIND)
1217 {
1218 _PyUnicode_CONVERT_BYTES(
1219 Py_UCS2, Py_UCS4,
1220 PyUnicode_2BYTE_DATA(from) + from_start,
1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222 PyUnicode_4BYTE_DATA(to) + to_start
1223 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001224 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001226 /* check if max_char(from substring) <= max_char(to) */
1227 if (from_kind > to_kind
1228 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001229 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001230 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001231 /* slow path to check for character overflow */
1232 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 Py_ssize_t i;
1235
Victor Stinner56c161a2011-10-06 02:47:11 +02001236#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001237 for (i=0; i < how_many; i++) {
1238 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001239 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001240 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1241 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001242#else
1243 if (!check_maxchar) {
1244 for (i=0; i < how_many; i++) {
1245 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1246 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1247 }
1248 }
1249 else {
1250 for (i=0; i < how_many; i++) {
1251 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1252 if (ch > to_maxchar)
1253 return 1;
1254 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1255 }
1256 }
1257#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001258 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001259 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001260 assert(0 && "inconsistent state");
1261 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001262 }
1263 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001264 return 0;
1265}
1266
1267static void
1268copy_characters(PyObject *to, Py_ssize_t to_start,
1269 PyObject *from, Py_ssize_t from_start,
1270 Py_ssize_t how_many)
1271{
1272 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1273}
1274
1275Py_ssize_t
1276PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1277 PyObject *from, Py_ssize_t from_start,
1278 Py_ssize_t how_many)
1279{
1280 int err;
1281
1282 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1283 PyErr_BadInternalCall();
1284 return -1;
1285 }
1286
Benjamin Petersonbac79492012-01-14 13:34:47 -05001287 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001288 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001289 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001290 return -1;
1291
1292 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1293 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1294 PyErr_Format(PyExc_SystemError,
1295 "Cannot write %zi characters at %zi "
1296 "in a string of %zi characters",
1297 how_many, to_start, PyUnicode_GET_LENGTH(to));
1298 return -1;
1299 }
1300
1301 if (how_many == 0)
1302 return 0;
1303
Victor Stinner488fa492011-12-12 00:01:39 +01001304 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001305 return -1;
1306
1307 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1308 if (err) {
1309 PyErr_Format(PyExc_SystemError,
1310 "Cannot copy %s characters "
1311 "into a string of %s characters",
1312 unicode_kind_name(from),
1313 unicode_kind_name(to));
1314 return -1;
1315 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001316 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317}
1318
Victor Stinner17222162011-09-28 22:15:37 +02001319/* Find the maximum code point and count the number of surrogate pairs so a
1320 correct string length can be computed before converting a string to UCS4.
1321 This function counts single surrogates as a character and not as a pair.
1322
1323 Return 0 on success, or -1 on error. */
1324static int
1325find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1326 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327{
1328 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001329 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330
Victor Stinnerc53be962011-10-02 21:33:54 +02001331 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 *num_surrogates = 0;
1333 *maxchar = 0;
1334
1335 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001337 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1338 && (iter+1) < end
1339 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001341 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001342 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 iter += 2;
1344 }
1345 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001347 {
1348 ch = *iter;
1349 iter++;
1350 }
1351 if (ch > *maxchar) {
1352 *maxchar = ch;
1353 if (*maxchar > MAX_UNICODE) {
1354 PyErr_Format(PyExc_ValueError,
1355 "character U+%x is not in range [U+0000; U+10ffff]",
1356 ch);
1357 return -1;
1358 }
1359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360 }
1361 return 0;
1362}
1363
1364#ifdef Py_DEBUG
/* Debug-build statistic: incremented on each call to _PyUnicode_Ready(). */
static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366#endif
1367
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001368int
1369_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370{
1371 wchar_t *end;
1372 Py_UCS4 maxchar = 0;
1373 Py_ssize_t num_surrogates;
1374#if SIZEOF_WCHAR_T == 2
1375 Py_ssize_t length_wo_surrogates;
1376#endif
1377
Georg Brandl7597add2011-10-05 16:36:47 +02001378 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001379 strings were created using _PyObject_New() and where no canonical
1380 representation (the str field) has been set yet aka strings
1381 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001382 assert(_PyUnicode_CHECK(unicode));
1383 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001384 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001385 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001386 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001387 /* Actually, it should neither be interned nor be anything else: */
1388 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389
1390#ifdef Py_DEBUG
1391 ++unicode_ready_calls;
1392#endif
1393
1394 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001395 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001396 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398
1399 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001400 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1401 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 PyErr_NoMemory();
1403 return -1;
1404 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001405 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 _PyUnicode_WSTR(unicode), end,
1407 PyUnicode_1BYTE_DATA(unicode));
1408 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1411 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001412 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001413 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001414 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 }
1416 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001417 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001418 _PyUnicode_UTF8(unicode) = NULL;
1419 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
1421 PyObject_FREE(_PyUnicode_WSTR(unicode));
1422 _PyUnicode_WSTR(unicode) = NULL;
1423 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1424 }
1425 /* In this case we might have to convert down from 4-byte native
1426 wchar_t to 2-byte unicode. */
1427 else if (maxchar < 65536) {
1428 assert(num_surrogates == 0 &&
1429 "FindMaxCharAndNumSurrogatePairs() messed up");
1430
Victor Stinner506f5922011-09-28 22:34:18 +02001431#if SIZEOF_WCHAR_T == 2
1432 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001433 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001434 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1435 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1436 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001437 _PyUnicode_UTF8(unicode) = NULL;
1438 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001439#else
1440 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001441 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001442 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001443 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001444 PyErr_NoMemory();
1445 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 }
Victor Stinner506f5922011-09-28 22:34:18 +02001447 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1448 _PyUnicode_WSTR(unicode), end,
1449 PyUnicode_2BYTE_DATA(unicode));
1450 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1451 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1452 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001453 _PyUnicode_UTF8(unicode) = NULL;
1454 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001455 PyObject_FREE(_PyUnicode_WSTR(unicode));
1456 _PyUnicode_WSTR(unicode) = NULL;
1457 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1458#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 }
1460 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1461 else {
1462#if SIZEOF_WCHAR_T == 2
1463 /* in case the native representation is 2-bytes, we need to allocate a
1464 new normalized 4-byte version. */
1465 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1467 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 PyErr_NoMemory();
1469 return -1;
1470 }
1471 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1472 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001473 _PyUnicode_UTF8(unicode) = NULL;
1474 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001475 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1476 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001477 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 PyObject_FREE(_PyUnicode_WSTR(unicode));
1479 _PyUnicode_WSTR(unicode) = NULL;
1480 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1481#else
1482 assert(num_surrogates == 0);
1483
Victor Stinnerc3c74152011-10-02 20:39:55 +02001484 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001486 _PyUnicode_UTF8(unicode) = NULL;
1487 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001488 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1489#endif
1490 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1491 }
1492 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001493 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 return 0;
1495}
1496
Alexander Belopolsky40018472011-02-26 01:02:56 +00001497static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001498unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001499{
Walter Dörwald16807132007-05-25 13:52:07 +00001500 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001501 case SSTATE_NOT_INTERNED:
1502 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001503
Benjamin Peterson29060642009-01-31 22:14:21 +00001504 case SSTATE_INTERNED_MORTAL:
1505 /* revive dead object temporarily for DelItem */
1506 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001507 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001508 Py_FatalError(
1509 "deletion of interned string failed");
1510 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001511
Benjamin Peterson29060642009-01-31 22:14:21 +00001512 case SSTATE_INTERNED_IMMORTAL:
1513 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001514
Benjamin Peterson29060642009-01-31 22:14:21 +00001515 default:
1516 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001517 }
1518
Victor Stinner03490912011-10-03 23:45:12 +02001519 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001521 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001522 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001523 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1524 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001526 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527}
1528
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared singleton
   objects (the empty string, or the cached one-character string stored in
   unicode_latin1[] for its character), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string with a canonical representation and exactly one
       character can be a cached latin1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546static int
Victor Stinner488fa492011-12-12 00:01:39 +01001547unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001548{
Victor Stinner488fa492011-12-12 00:01:39 +01001549 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 if (Py_REFCNT(unicode) != 1)
1551 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001552 if (_PyUnicode_HASH(unicode) != -1)
1553 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001554 if (PyUnicode_CHECK_INTERNED(unicode))
1555 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001556 if (!PyUnicode_CheckExact(unicode))
1557 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001558#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 /* singleton refcount is greater than 1 */
1560 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001561#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001562 return 1;
1563}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001564
Victor Stinnerfe226c02011-10-03 03:52:20 +02001565static int
1566unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1567{
1568 PyObject *unicode;
1569 Py_ssize_t old_length;
1570
1571 assert(p_unicode != NULL);
1572 unicode = *p_unicode;
1573
1574 assert(unicode != NULL);
1575 assert(PyUnicode_Check(unicode));
1576 assert(0 <= length);
1577
Victor Stinner910337b2011-10-03 03:20:16 +02001578 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001579 old_length = PyUnicode_WSTR_LENGTH(unicode);
1580 else
1581 old_length = PyUnicode_GET_LENGTH(unicode);
1582 if (old_length == length)
1583 return 0;
1584
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001585 if (length == 0) {
1586 Py_DECREF(*p_unicode);
1587 *p_unicode = unicode_empty;
1588 Py_INCREF(*p_unicode);
1589 return 0;
1590 }
1591
Victor Stinner488fa492011-12-12 00:01:39 +01001592 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001593 PyObject *copy = resize_copy(unicode, length);
1594 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001595 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001596 Py_DECREF(*p_unicode);
1597 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001598 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001599 }
1600
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001602 PyObject *new_unicode = resize_compact(unicode, length);
1603 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001604 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001605 *p_unicode = new_unicode;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001606 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001607 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001608 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001609 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001610}
1611
Alexander Belopolsky40018472011-02-26 01:02:56 +00001612int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001614{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001615 PyObject *unicode;
1616 if (p_unicode == NULL) {
1617 PyErr_BadInternalCall();
1618 return -1;
1619 }
1620 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001621 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 {
1623 PyErr_BadInternalCall();
1624 return -1;
1625 }
1626 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001627}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001628
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001629static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001630unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001631{
1632 PyObject *result;
1633 assert(PyUnicode_IS_READY(*p_unicode));
1634 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1635 return 0;
1636 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1637 maxchar);
1638 if (result == NULL)
1639 return -1;
1640 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1641 PyUnicode_GET_LENGTH(*p_unicode));
1642 Py_DECREF(*p_unicode);
1643 *p_unicode = result;
1644 return 0;
1645}
1646
1647static int
1648unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1649 Py_UCS4 ch)
1650{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001651 assert(ch <= MAX_UNICODE);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652 if (unicode_widen(p_unicode, ch) < 0)
1653 return -1;
1654 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1655 PyUnicode_DATA(*p_unicode),
1656 (*pos)++, ch);
1657 return 0;
1658}
1659
Victor Stinnerc5166102012-02-22 13:55:02 +01001660/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1661 Return the length of the input string.
1662
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001663 WARNING: The function doesn't copy the terminating null character and
1664 doesn't check the maximum character (may write a latin1 character in an
1665 ASCII string). */
Victor Stinnerc5166102012-02-22 13:55:02 +01001666static Py_ssize_t
1667unicode_write_cstr(PyObject *unicode, Py_ssize_t index, const char *str)
1668{
1669 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1670 void *data = PyUnicode_DATA(unicode);
1671
1672 switch (kind) {
1673 case PyUnicode_1BYTE_KIND: {
1674 Py_ssize_t len = strlen(str);
1675 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001676 memcpy((char *) data + index, str, len);
Victor Stinnerc5166102012-02-22 13:55:02 +01001677 return len;
1678 }
1679 case PyUnicode_2BYTE_KIND: {
1680 Py_UCS2 *start = (Py_UCS2 *)data + index;
1681 Py_UCS2 *ucs2 = start;
1682 assert(index <= PyUnicode_GET_LENGTH(unicode));
1683
1684 for (; *str; ++ucs2, ++str)
1685 *ucs2 = (Py_UCS2)*str;
1686
1687 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1688 return ucs2 - start;
1689 }
1690 default: {
1691 Py_UCS4 *start = (Py_UCS4 *)data + index;
1692 Py_UCS4 *ucs4 = start;
1693 assert(kind == PyUnicode_4BYTE_KIND);
1694 assert(index <= PyUnicode_GET_LENGTH(unicode));
1695
1696 for (; *str; ++ucs4, ++str)
1697 *ucs4 = (Py_UCS4)*str;
1698
1699 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1700 return ucs4 - start;
1701 }
1702 }
1703}
1704
1705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001706static PyObject*
1707get_latin1_char(unsigned char ch)
1708{
Victor Stinnera464fc12011-10-02 20:39:30 +02001709 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001711 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712 if (!unicode)
1713 return NULL;
1714 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001715 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001716 unicode_latin1[ch] = unicode;
1717 }
1718 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001719 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001720}
1721
Alexander Belopolsky40018472011-02-26 01:02:56 +00001722PyObject *
1723PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001725 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001726 Py_UCS4 maxchar = 0;
1727 Py_ssize_t num_surrogates;
1728
1729 if (u == NULL)
1730 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001732 /* If the Unicode data is known at construction time, we can apply
1733 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 /* Optimization for empty strings */
1736 if (size == 0 && unicode_empty != NULL) {
1737 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001738 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001739 }
Tim Petersced69f82003-09-16 20:30:58 +00001740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 /* Single character Unicode objects in the Latin-1 range are
1742 shared when using this constructor */
1743 if (size == 1 && *u < 256)
1744 return get_latin1_char((unsigned char)*u);
1745
1746 /* If not empty and not single character, copy the Unicode data
1747 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001748 if (find_maxchar_surrogates(u, u + size,
1749 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 return NULL;
1751
Victor Stinner8faf8212011-12-08 22:14:11 +01001752 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 if (!unicode)
1754 return NULL;
1755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 switch (PyUnicode_KIND(unicode)) {
1757 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001758 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1760 break;
1761 case PyUnicode_2BYTE_KIND:
1762#if Py_UNICODE_SIZE == 2
1763 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1764#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001765 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1767#endif
1768 break;
1769 case PyUnicode_4BYTE_KIND:
1770#if SIZEOF_WCHAR_T == 2
1771 /* This is the only case which has to process surrogates, thus
1772 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001773 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774#else
1775 assert(num_surrogates == 0);
1776 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1777#endif
1778 break;
1779 default:
1780 assert(0 && "Impossible state");
1781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001783 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784}
1785
Alexander Belopolsky40018472011-02-26 01:02:56 +00001786PyObject *
1787PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001788{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001789 if (size < 0) {
1790 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001791 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001792 return NULL;
1793 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001794 if (u != NULL)
1795 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1796 else
1797 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001798}
1799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001802{
1803 size_t size = strlen(u);
1804 if (size > PY_SSIZE_T_MAX) {
1805 PyErr_SetString(PyExc_OverflowError, "input too long");
1806 return NULL;
1807 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001808 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001809}
1810
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001811PyObject *
1812_PyUnicode_FromId(_Py_Identifier *id)
1813{
1814 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001815 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1816 strlen(id->string),
1817 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001818 if (!id->object)
1819 return NULL;
1820 PyUnicode_InternInPlace(&id->object);
1821 assert(!id->next);
1822 id->next = static_strings;
1823 static_strings = id;
1824 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001825 return id->object;
1826}
1827
1828void
1829_PyUnicode_ClearStaticStrings()
1830{
1831 _Py_Identifier *i;
1832 for (i = static_strings; i; i = i->next) {
1833 Py_DECREF(i->object);
1834 i->object = NULL;
1835 i->next = NULL;
1836 }
1837}
1838
Benjamin Peterson0df54292012-03-26 14:50:32 -04001839/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001840
Victor Stinnere57b1c02011-09-28 22:20:48 +02001841static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001842unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001843{
Victor Stinner785938e2011-12-11 20:09:03 +01001844 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001845 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001846#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001847 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001848#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001849 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001850 }
Victor Stinner785938e2011-12-11 20:09:03 +01001851 unicode = PyUnicode_New(size, 127);
1852 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001853 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001854 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1855 assert(_PyUnicode_CheckConsistency(unicode, 1));
1856 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001857}
1858
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001859static Py_UCS4
1860kind_maxchar_limit(unsigned int kind)
1861{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001862 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001863 case PyUnicode_1BYTE_KIND:
1864 return 0x80;
1865 case PyUnicode_2BYTE_KIND:
1866 return 0x100;
1867 case PyUnicode_4BYTE_KIND:
1868 return 0x10000;
1869 default:
1870 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001871 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001872 }
1873}
1874
Victor Stinnere6abb482012-05-02 01:15:40 +02001875Py_LOCAL_INLINE(Py_UCS4)
1876align_maxchar(Py_UCS4 maxchar)
1877{
1878 if (maxchar <= 127)
1879 return 127;
1880 else if (maxchar <= 255)
1881 return 255;
1882 else if (maxchar <= 65535)
1883 return 65535;
1884 else
1885 return MAX_UNICODE;
1886}
1887
Victor Stinner702c7342011-10-05 13:50:52 +02001888static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001889_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001891 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001892 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001893
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001894 if (size == 0) {
1895 Py_INCREF(unicode_empty);
1896 return unicode_empty;
1897 }
1898 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001899 if (size == 1)
1900 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001901
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001902 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001903 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001904 if (!res)
1905 return NULL;
1906 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001907 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001909}
1910
Victor Stinnere57b1c02011-09-28 22:20:48 +02001911static PyObject*
1912_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913{
1914 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001915 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001916
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 if (size == 0) {
1918 Py_INCREF(unicode_empty);
1919 return unicode_empty;
1920 }
1921 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001922 if (size == 1) {
1923 Py_UCS4 ch = u[0];
1924 if (ch < 256)
1925 return get_latin1_char((unsigned char)ch);
1926
1927 res = PyUnicode_New(1, ch);
1928 if (res == NULL)
1929 return NULL;
1930 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1931 assert(_PyUnicode_CheckConsistency(res, 1));
1932 return res;
1933 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001934
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001935 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001936 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937 if (!res)
1938 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001939 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001941 else {
1942 _PyUnicode_CONVERT_BYTES(
1943 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1944 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001945 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001946 return res;
1947}
1948
Victor Stinnere57b1c02011-09-28 22:20:48 +02001949static PyObject*
1950_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001951{
1952 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001953 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001954
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001955 if (size == 0) {
1956 Py_INCREF(unicode_empty);
1957 return unicode_empty;
1958 }
1959 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001960 if (size == 1) {
1961 Py_UCS4 ch = u[0];
1962 if (ch < 256)
1963 return get_latin1_char((unsigned char)ch);
1964
1965 res = PyUnicode_New(1, ch);
1966 if (res == NULL)
1967 return NULL;
1968 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1969 assert(_PyUnicode_CheckConsistency(res, 1));
1970 return res;
1971 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001972
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001973 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001974 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 if (!res)
1976 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001977 if (max_char < 256)
1978 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1979 PyUnicode_1BYTE_DATA(res));
1980 else if (max_char < 0x10000)
1981 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1982 PyUnicode_2BYTE_DATA(res));
1983 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001985 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986 return res;
1987}
1988
1989PyObject*
1990PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1991{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001992 if (size < 0) {
1993 PyErr_SetString(PyExc_ValueError, "size must be positive");
1994 return NULL;
1995 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001996 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001998 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002000 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002002 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002003 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 PyErr_SetString(PyExc_SystemError, "invalid kind");
2005 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002007}
2008
Victor Stinnerece58de2012-04-23 23:36:38 +02002009Py_UCS4
2010_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2011{
2012 enum PyUnicode_Kind kind;
2013 void *startptr, *endptr;
2014
2015 assert(PyUnicode_IS_READY(unicode));
2016 assert(0 <= start);
2017 assert(end <= PyUnicode_GET_LENGTH(unicode));
2018 assert(start <= end);
2019
2020 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2021 return PyUnicode_MAX_CHAR_VALUE(unicode);
2022
2023 if (start == end)
2024 return 127;
2025
Victor Stinner94d558b2012-04-27 22:26:58 +02002026 if (PyUnicode_IS_ASCII(unicode))
2027 return 127;
2028
Victor Stinnerece58de2012-04-23 23:36:38 +02002029 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002030 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002031 endptr = (char *)startptr + end * kind;
2032 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002033 switch(kind) {
2034 case PyUnicode_1BYTE_KIND:
2035 return ucs1lib_find_max_char(startptr, endptr);
2036 case PyUnicode_2BYTE_KIND:
2037 return ucs2lib_find_max_char(startptr, endptr);
2038 case PyUnicode_4BYTE_KIND:
2039 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002040 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002041 assert(0);
2042 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002043 }
2044}
2045
Victor Stinner25a4b292011-10-06 12:31:55 +02002046/* Ensure that a string uses the most efficient storage, if it is not the
2047 case: create a new string with of the right kind. Write NULL into *p_unicode
2048 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002049static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002050unicode_adjust_maxchar(PyObject **p_unicode)
2051{
2052 PyObject *unicode, *copy;
2053 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002054 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002055 unsigned int kind;
2056
2057 assert(p_unicode != NULL);
2058 unicode = *p_unicode;
2059 assert(PyUnicode_IS_READY(unicode));
2060 if (PyUnicode_IS_ASCII(unicode))
2061 return;
2062
2063 len = PyUnicode_GET_LENGTH(unicode);
2064 kind = PyUnicode_KIND(unicode);
2065 if (kind == PyUnicode_1BYTE_KIND) {
2066 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002067 max_char = ucs1lib_find_max_char(u, u + len);
2068 if (max_char >= 128)
2069 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002070 }
2071 else if (kind == PyUnicode_2BYTE_KIND) {
2072 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002073 max_char = ucs2lib_find_max_char(u, u + len);
2074 if (max_char >= 256)
2075 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002076 }
2077 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002078 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002079 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002080 max_char = ucs4lib_find_max_char(u, u + len);
2081 if (max_char >= 0x10000)
2082 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002084 copy = PyUnicode_New(len, max_char);
2085 copy_characters(copy, 0, unicode, 0, len);
2086 Py_DECREF(unicode);
2087 *p_unicode = copy;
2088}
2089
Victor Stinner034f6cf2011-09-30 02:26:44 +02002090PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002091_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092{
Victor Stinner87af4f22011-11-21 23:03:47 +01002093 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002094 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002095
Victor Stinner034f6cf2011-09-30 02:26:44 +02002096 if (!PyUnicode_Check(unicode)) {
2097 PyErr_BadInternalCall();
2098 return NULL;
2099 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002100 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002101 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002102
Victor Stinner87af4f22011-11-21 23:03:47 +01002103 length = PyUnicode_GET_LENGTH(unicode);
2104 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002105 if (!copy)
2106 return NULL;
2107 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2108
Victor Stinner87af4f22011-11-21 23:03:47 +01002109 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2110 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002111 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002112 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002113}
2114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115
Victor Stinnerbc603d12011-10-02 01:00:40 +02002116/* Widen Unicode objects to larger buffers. Don't write terminating null
2117 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118
2119void*
2120_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2121{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002122 Py_ssize_t len;
2123 void *result;
2124 unsigned int skind;
2125
Benjamin Petersonbac79492012-01-14 13:34:47 -05002126 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002127 return NULL;
2128
2129 len = PyUnicode_GET_LENGTH(s);
2130 skind = PyUnicode_KIND(s);
2131 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002132 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 return NULL;
2134 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002135 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136 case PyUnicode_2BYTE_KIND:
2137 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2138 if (!result)
2139 return PyErr_NoMemory();
2140 assert(skind == PyUnicode_1BYTE_KIND);
2141 _PyUnicode_CONVERT_BYTES(
2142 Py_UCS1, Py_UCS2,
2143 PyUnicode_1BYTE_DATA(s),
2144 PyUnicode_1BYTE_DATA(s) + len,
2145 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 case PyUnicode_4BYTE_KIND:
2148 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2149 if (!result)
2150 return PyErr_NoMemory();
2151 if (skind == PyUnicode_2BYTE_KIND) {
2152 _PyUnicode_CONVERT_BYTES(
2153 Py_UCS2, Py_UCS4,
2154 PyUnicode_2BYTE_DATA(s),
2155 PyUnicode_2BYTE_DATA(s) + len,
2156 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 else {
2159 assert(skind == PyUnicode_1BYTE_KIND);
2160 _PyUnicode_CONVERT_BYTES(
2161 Py_UCS1, Py_UCS4,
2162 PyUnicode_1BYTE_DATA(s),
2163 PyUnicode_1BYTE_DATA(s) + len,
2164 result);
2165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 default:
2168 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 }
Victor Stinner01698042011-10-04 00:04:26 +02002170 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 return NULL;
2172}
2173
2174static Py_UCS4*
2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2176 int copy_null)
2177{
2178 int kind;
2179 void *data;
2180 Py_ssize_t len, targetlen;
2181 if (PyUnicode_READY(string) == -1)
2182 return NULL;
2183 kind = PyUnicode_KIND(string);
2184 data = PyUnicode_DATA(string);
2185 len = PyUnicode_GET_LENGTH(string);
2186 targetlen = len;
2187 if (copy_null)
2188 targetlen++;
2189 if (!target) {
2190 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2191 PyErr_NoMemory();
2192 return NULL;
2193 }
2194 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2195 if (!target) {
2196 PyErr_NoMemory();
2197 return NULL;
2198 }
2199 }
2200 else {
2201 if (targetsize < targetlen) {
2202 PyErr_Format(PyExc_SystemError,
2203 "string is longer than the buffer");
2204 if (copy_null && 0 < targetsize)
2205 target[0] = 0;
2206 return NULL;
2207 }
2208 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002209 if (kind == PyUnicode_1BYTE_KIND) {
2210 Py_UCS1 *start = (Py_UCS1 *) data;
2211 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002213 else if (kind == PyUnicode_2BYTE_KIND) {
2214 Py_UCS2 *start = (Py_UCS2 *) data;
2215 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2216 }
2217 else {
2218 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 if (copy_null)
2222 target[len] = 0;
2223 return target;
2224}
2225
2226Py_UCS4*
2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002230 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 PyErr_BadInternalCall();
2232 return NULL;
2233 }
2234 return as_ucs4(string, target, targetsize, copy_null);
2235}
2236
2237Py_UCS4*
2238PyUnicode_AsUCS4Copy(PyObject *string)
2239{
2240 return as_ucs4(string, NULL, 0, 1);
2241}
2242
2243#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002244
Alexander Belopolsky40018472011-02-26 01:02:56 +00002245PyObject *
2246PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002249 if (size == 0) {
2250 Py_INCREF(unicode_empty);
2251 return unicode_empty;
2252 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002253 PyErr_BadInternalCall();
2254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002255 }
2256
Martin v. Löwis790465f2008-04-05 20:41:37 +00002257 if (size == -1) {
2258 size = wcslen(w);
2259 }
2260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262}
2263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002265
Walter Dörwald346737f2007-05-31 10:44:43 +00002266static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2268 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002269{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 *fmt++ = '%';
2271 if (width) {
2272 if (zeropad)
2273 *fmt++ = '0';
2274 fmt += sprintf(fmt, "%d", width);
2275 }
2276 if (precision)
2277 fmt += sprintf(fmt, ".%d", precision);
2278 if (longflag)
2279 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002280 else if (longlongflag) {
2281 /* longlongflag should only ever be nonzero on machines with
2282 HAVE_LONG_LONG defined */
2283#ifdef HAVE_LONG_LONG
2284 char *f = PY_FORMAT_LONG_LONG;
2285 while (*f)
2286 *fmt++ = *f++;
2287#else
2288 /* we shouldn't ever get here */
2289 assert(0);
2290 *fmt++ = 'l';
2291#endif
2292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 else if (size_tflag) {
2294 char *f = PY_FORMAT_SIZE_T;
2295 while (*f)
2296 *fmt++ = *f++;
2297 }
2298 *fmt++ = c;
2299 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002300}
2301
Victor Stinner96865452011-03-01 23:44:09 +00002302/* helper for PyUnicode_FromFormatV() */
2303
2304static const char*
2305parse_format_flags(const char *f,
2306 int *p_width, int *p_precision,
2307 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2308{
2309 int width, precision, longflag, longlongflag, size_tflag;
2310
2311 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2312 f++;
2313 width = 0;
2314 while (Py_ISDIGIT((unsigned)*f))
2315 width = (width*10) + *f++ - '0';
2316 precision = 0;
2317 if (*f == '.') {
2318 f++;
2319 while (Py_ISDIGIT((unsigned)*f))
2320 precision = (precision*10) + *f++ - '0';
2321 if (*f == '%') {
2322 /* "%.3%s" => f points to "3" */
2323 f--;
2324 }
2325 }
2326 if (*f == '\0') {
2327 /* bogus format "%.1" => go backward, f points to "1" */
2328 f--;
2329 }
2330 if (p_width != NULL)
2331 *p_width = width;
2332 if (p_precision != NULL)
2333 *p_precision = precision;
2334
2335 /* Handle %ld, %lu, %lld and %llu. */
2336 longflag = 0;
2337 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002338 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002339
2340 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002341 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002342 longflag = 1;
2343 ++f;
2344 }
2345#ifdef HAVE_LONG_LONG
2346 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002347 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002348 longlongflag = 1;
2349 f += 2;
2350 }
2351#endif
2352 }
2353 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002354 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002355 size_tflag = 1;
2356 ++f;
2357 }
2358 if (p_longflag != NULL)
2359 *p_longflag = longflag;
2360 if (p_longlongflag != NULL)
2361 *p_longlongflag = longlongflag;
2362 if (p_size_tflag != NULL)
2363 *p_size_tflag = size_tflag;
2364 return f;
2365}
2366
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002367/* maximum number of characters required for output of %ld. 21 characters
2368 allows for 64-bit integers (in decimal) and an optional sign. */
2369#define MAX_LONG_CHARS 21
2370/* maximum number of characters required for output of %lld.
2371 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2372 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2373#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2374
Walter Dörwaldd2034312007-05-18 16:29:38 +00002375PyObject *
2376PyUnicode_FromFormatV(const char *format, va_list vargs)
2377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002378 va_list count;
2379 Py_ssize_t callcount = 0;
2380 PyObject **callresults = NULL;
2381 PyObject **callresult = NULL;
2382 Py_ssize_t n = 0;
2383 int width = 0;
2384 int precision = 0;
2385 int zeropad;
2386 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002387 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002388 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002389 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2391 Py_UCS4 argmaxchar;
2392 Py_ssize_t numbersize = 0;
2393 char *numberresults = NULL;
2394 char *numberresult = NULL;
2395 Py_ssize_t i;
2396 int kind;
2397 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002398
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002399 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002400 /* step 1: count the number of %S/%R/%A/%s format specifications
2401 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2402 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002404 * also estimate a upper bound for all the number formats in the string,
2405 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 for (f = format; *f; f++) {
2408 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002409 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2411 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2412 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2413 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002416#ifdef HAVE_LONG_LONG
2417 if (longlongflag) {
2418 if (width < MAX_LONG_LONG_CHARS)
2419 width = MAX_LONG_LONG_CHARS;
2420 }
2421 else
2422#endif
2423 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2424 including sign. Decimal takes the most space. This
2425 isn't enough for octal. If a width is specified we
2426 need more (which we allocate later). */
2427 if (width < MAX_LONG_CHARS)
2428 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429
2430 /* account for the size + '\0' to separate numbers
2431 inside of the numberresults buffer */
2432 numbersize += (width + 1);
2433 }
2434 }
2435 else if ((unsigned char)*f > 127) {
2436 PyErr_Format(PyExc_ValueError,
2437 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2438 "string, got a non-ASCII byte: 0x%02x",
2439 (unsigned char)*f);
2440 return NULL;
2441 }
2442 }
2443 /* step 2: allocate memory for the results of
2444 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2445 if (callcount) {
2446 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2447 if (!callresults) {
2448 PyErr_NoMemory();
2449 return NULL;
2450 }
2451 callresult = callresults;
2452 }
2453 /* step 2.5: allocate memory for the results of formating numbers */
2454 if (numbersize) {
2455 numberresults = PyObject_Malloc(numbersize);
2456 if (!numberresults) {
2457 PyErr_NoMemory();
2458 goto fail;
2459 }
2460 numberresult = numberresults;
2461 }
2462
2463 /* step 3: format numbers and figure out how large a buffer we need */
2464 for (f = format; *f; f++) {
2465 if (*f == '%') {
2466 const char* p;
2467 int longflag;
2468 int longlongflag;
2469 int size_tflag;
2470 int numprinted;
2471
2472 p = f;
2473 zeropad = (f[1] == '0');
2474 f = parse_format_flags(f, &width, &precision,
2475 &longflag, &longlongflag, &size_tflag);
2476 switch (*f) {
2477 case 'c':
2478 {
2479 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002480 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 n++;
2482 break;
2483 }
2484 case '%':
2485 n++;
2486 break;
2487 case 'i':
2488 case 'd':
2489 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2490 width, precision, *f);
2491 if (longflag)
2492 numprinted = sprintf(numberresult, fmt,
2493 va_arg(count, long));
2494#ifdef HAVE_LONG_LONG
2495 else if (longlongflag)
2496 numprinted = sprintf(numberresult, fmt,
2497 va_arg(count, PY_LONG_LONG));
2498#endif
2499 else if (size_tflag)
2500 numprinted = sprintf(numberresult, fmt,
2501 va_arg(count, Py_ssize_t));
2502 else
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, int));
2505 n += numprinted;
2506 /* advance by +1 to skip over the '\0' */
2507 numberresult += (numprinted + 1);
2508 assert(*(numberresult - 1) == '\0');
2509 assert(*(numberresult - 2) != '\0');
2510 assert(numprinted >= 0);
2511 assert(numberresult <= numberresults + numbersize);
2512 break;
2513 case 'u':
2514 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2515 width, precision, 'u');
2516 if (longflag)
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, unsigned long));
2519#ifdef HAVE_LONG_LONG
2520 else if (longlongflag)
2521 numprinted = sprintf(numberresult, fmt,
2522 va_arg(count, unsigned PY_LONG_LONG));
2523#endif
2524 else if (size_tflag)
2525 numprinted = sprintf(numberresult, fmt,
2526 va_arg(count, size_t));
2527 else
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, unsigned int));
2530 n += numprinted;
2531 numberresult += (numprinted + 1);
2532 assert(*(numberresult - 1) == '\0');
2533 assert(*(numberresult - 2) != '\0');
2534 assert(numprinted >= 0);
2535 assert(numberresult <= numberresults + numbersize);
2536 break;
2537 case 'x':
2538 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2539 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2540 n += numprinted;
2541 numberresult += (numprinted + 1);
2542 assert(*(numberresult - 1) == '\0');
2543 assert(*(numberresult - 2) != '\0');
2544 assert(numprinted >= 0);
2545 assert(numberresult <= numberresults + numbersize);
2546 break;
2547 case 'p':
2548 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2549 /* %p is ill-defined: ensure leading 0x. */
2550 if (numberresult[1] == 'X')
2551 numberresult[1] = 'x';
2552 else if (numberresult[1] != 'x') {
2553 memmove(numberresult + 2, numberresult,
2554 strlen(numberresult) + 1);
2555 numberresult[0] = '0';
2556 numberresult[1] = 'x';
2557 numprinted += 2;
2558 }
2559 n += numprinted;
2560 numberresult += (numprinted + 1);
2561 assert(*(numberresult - 1) == '\0');
2562 assert(*(numberresult - 2) != '\0');
2563 assert(numprinted >= 0);
2564 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 break;
2566 case 's':
2567 {
2568 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002569 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002570 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002571 if (!str)
2572 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 /* since PyUnicode_DecodeUTF8 returns already flexible
2574 unicode objects, there is no need to call ready on them */
2575 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002576 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002578 /* Remember the str and switch to the next slot */
2579 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 break;
2581 }
2582 case 'U':
2583 {
2584 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002585 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 if (PyUnicode_READY(obj) == -1)
2587 goto fail;
2588 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002589 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 }
2593 case 'V':
2594 {
2595 PyObject *obj = va_arg(count, PyObject *);
2596 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002597 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002599 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 if (PyUnicode_READY(obj) == -1)
2602 goto fail;
2603 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002604 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002606 *callresult++ = NULL;
2607 }
2608 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002609 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002610 if (!str_obj)
2611 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002612 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002613 Py_DECREF(str_obj);
2614 goto fail;
2615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002617 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002619 *callresult++ = str_obj;
2620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'S':
2624 {
2625 PyObject *obj = va_arg(count, PyObject *);
2626 PyObject *str;
2627 assert(obj);
2628 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002629 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (PyUnicode_READY(str) == -1) {
2632 Py_DECREF(str);
2633 goto fail;
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002636 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 /* Remember the str and switch to the next slot */
2639 *callresult++ = str;
2640 break;
2641 }
2642 case 'R':
2643 {
2644 PyObject *obj = va_arg(count, PyObject *);
2645 PyObject *repr;
2646 assert(obj);
2647 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002648 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002650 if (PyUnicode_READY(repr) == -1) {
2651 Py_DECREF(repr);
2652 goto fail;
2653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002655 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* Remember the repr and switch to the next slot */
2658 *callresult++ = repr;
2659 break;
2660 }
2661 case 'A':
2662 {
2663 PyObject *obj = va_arg(count, PyObject *);
2664 PyObject *ascii;
2665 assert(obj);
2666 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002669 if (PyUnicode_READY(ascii) == -1) {
2670 Py_DECREF(ascii);
2671 goto fail;
2672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002674 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 /* Remember the repr and switch to the next slot */
2677 *callresult++ = ascii;
2678 break;
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 default:
2681 /* if we stumble upon an unknown
2682 formatting code, copy the rest of
2683 the format string to the output
2684 string. (we cannot just skip the
2685 code, since there's no way to know
2686 what's in the argument list) */
2687 n += strlen(p);
2688 goto expand;
2689 }
2690 } else
2691 n++;
2692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 we don't have to resize the string.
2697 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002698 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 if (!string)
2700 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 kind = PyUnicode_KIND(string);
2702 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002708 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002709
2710 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2712 /* checking for == because the last argument could be a empty
2713 string, which causes i to point to end, the assert at the end of
2714 the loop */
2715 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 switch (*f) {
2718 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002719 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 const int ordinal = va_arg(vargs, int);
2721 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002723 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002724 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002729 {
2730 Py_ssize_t written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 /* unused, since we already have the result */
2732 if (*f == 'p')
2733 (void) va_arg(vargs, void *);
2734 else
2735 (void) va_arg(vargs, int);
2736 /* extract the result from numberresults and append. */
Victor Stinnerc5166102012-02-22 13:55:02 +01002737 written = unicode_write_cstr(string, i, numberresult);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 /* skip over the separating '\0' */
Victor Stinnerc5166102012-02-22 13:55:02 +01002739 i += written;
2740 numberresult += written;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 assert(*numberresult == '\0');
2742 numberresult++;
2743 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 case 's':
2747 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002748 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002750 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 size = PyUnicode_GET_LENGTH(*callresult);
2752 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002753 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002754 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002755 /* We're done with the unicode()/repr() => forget it */
2756 Py_DECREF(*callresult);
2757 /* switch to next unicode()/repr() result */
2758 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002759 break;
2760 }
2761 case 'U':
2762 {
2763 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 Py_ssize_t size;
2765 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2766 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002767 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002769 break;
2770 }
2771 case 'V':
2772 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002775 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002776 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002777 size = PyUnicode_GET_LENGTH(obj);
2778 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002779 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002780 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002781 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002782 size = PyUnicode_GET_LENGTH(*callresult);
2783 assert(PyUnicode_KIND(*callresult) <=
2784 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002785 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002787 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002789 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002790 break;
2791 }
2792 case 'S':
2793 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002794 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002795 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002796 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002797 /* unused, since we already have the result */
2798 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002800 copy_characters(string, i, *callresult, 0, size);
2801 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002802 /* We're done with the unicode()/repr() => forget it */
2803 Py_DECREF(*callresult);
2804 /* switch to next unicode()/repr() result */
2805 ++callresult;
2806 break;
2807 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002808 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002809 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 break;
2811 default:
Victor Stinnerc5166102012-02-22 13:55:02 +01002812 i += unicode_write_cstr(string, i, p);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002814 goto end;
2815 }
Victor Stinner1205f272010-09-11 00:54:47 +00002816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 else {
2818 assert(i < PyUnicode_GET_LENGTH(string));
2819 PyUnicode_WRITE(kind, data, i++, *f);
2820 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002823
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002825 if (callresults)
2826 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 if (numberresults)
2828 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002829 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002830 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002831 if (callresults) {
2832 PyObject **callresult2 = callresults;
2833 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002834 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002835 ++callresult2;
2836 }
2837 PyObject_Free(callresults);
2838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002839 if (numberresults)
2840 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002841 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002842}
2843
Walter Dörwaldd2034312007-05-18 16:29:38 +00002844PyObject *
2845PyUnicode_FromFormat(const char *format, ...)
2846{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002847 PyObject* ret;
2848 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002849
2850#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002851 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002852#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002853 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002855 ret = PyUnicode_FromFormatV(format, vargs);
2856 va_end(vargs);
2857 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002858}
2859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002860#ifdef HAVE_WCHAR_H
2861
Victor Stinner5593d8a2010-10-02 11:11:27 +00002862/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2863 convert a Unicode object to a wide character string.
2864
Victor Stinnerd88d9832011-09-06 02:00:05 +02002865 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002866 character) required to convert the unicode object. Ignore size argument.
2867
Victor Stinnerd88d9832011-09-06 02:00:05 +02002868 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002869 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002870 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002871static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002872unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002873 wchar_t *w,
2874 Py_ssize_t size)
2875{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002876 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002877 const wchar_t *wstr;
2878
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002879 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 if (wstr == NULL)
2881 return -1;
2882
Victor Stinner5593d8a2010-10-02 11:11:27 +00002883 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002884 if (size > res)
2885 size = res + 1;
2886 else
2887 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002888 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002889 return res;
2890 }
2891 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002892 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002893}
2894
2895Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002896PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002897 wchar_t *w,
2898 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899{
2900 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 PyErr_BadInternalCall();
2902 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002903 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002904 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905}
2906
Victor Stinner137c34c2010-09-29 10:25:54 +00002907wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002908PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002909 Py_ssize_t *size)
2910{
2911 wchar_t* buffer;
2912 Py_ssize_t buflen;
2913
2914 if (unicode == NULL) {
2915 PyErr_BadInternalCall();
2916 return NULL;
2917 }
2918
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002919 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920 if (buflen == -1)
2921 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002922 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002923 PyErr_NoMemory();
2924 return NULL;
2925 }
2926
Victor Stinner137c34c2010-09-29 10:25:54 +00002927 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2928 if (buffer == NULL) {
2929 PyErr_NoMemory();
2930 return NULL;
2931 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002932 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002933 if (buflen == -1)
2934 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002935 if (size != NULL)
2936 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002937 return buffer;
2938}
2939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941
Alexander Belopolsky40018472011-02-26 01:02:56 +00002942PyObject *
2943PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002944{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002945 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002946 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 PyErr_SetString(PyExc_ValueError,
2948 "chr() arg not in range(0x110000)");
2949 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002950 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 if (ordinal < 256)
2953 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 v = PyUnicode_New(1, ordinal);
2956 if (v == NULL)
2957 return NULL;
2958 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002959 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002960 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002961}
2962
Alexander Belopolsky40018472011-02-26 01:02:56 +00002963PyObject *
2964PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002966 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002968 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002969 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002970 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002971 Py_INCREF(obj);
2972 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002973 }
2974 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002975 /* For a Unicode subtype that's not a Unicode object,
2976 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002977 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002978 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002979 PyErr_Format(PyExc_TypeError,
2980 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002981 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002982 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002983}
2984
Alexander Belopolsky40018472011-02-26 01:02:56 +00002985PyObject *
2986PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002987 const char *encoding,
2988 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002989{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002990 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002991 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002992
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002994 PyErr_BadInternalCall();
2995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002997
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002998 /* Decoding bytes objects is the most common case and should be fast */
2999 if (PyBytes_Check(obj)) {
3000 if (PyBytes_GET_SIZE(obj) == 0) {
3001 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003002 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003003 }
3004 else {
3005 v = PyUnicode_Decode(
3006 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3007 encoding, errors);
3008 }
3009 return v;
3010 }
3011
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003012 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 PyErr_SetString(PyExc_TypeError,
3014 "decoding str is not supported");
3015 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003016 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003017
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003018 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3019 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3020 PyErr_Format(PyExc_TypeError,
3021 "coercing to str: need bytes, bytearray "
3022 "or buffer-like object, %.80s found",
3023 Py_TYPE(obj)->tp_name);
3024 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003025 }
Tim Petersced69f82003-09-16 20:30:58 +00003026
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003027 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003029 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
Tim Petersced69f82003-09-16 20:30:58 +00003031 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003032 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003033
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003034 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003035 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036}
3037
/* Normalize an encoding name so it can be compared against the built-in
   shortcuts: copy it into `lower`, converting upper-case letters to lower
   case and replacing '_' with '-' (so that e.g. "UTF_8" becomes "utf-8").

   A NULL encoding is treated as "utf-8".

   Return 1 on success.  Return 0 if the normalized name, including its
   terminating null byte, does not fit into the lower_len bytes available
   in `lower`; the contents of `lower` are then unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte as well. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }

    /* Without room for even the terminator, &lower[lower_len - 1] below
       would point before the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3074
Alexander Belopolsky40018472011-02-26 01:02:56 +00003075PyObject *
3076PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003077 Py_ssize_t size,
3078 const char *encoding,
3079 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003080{
3081 PyObject *buffer = NULL, *unicode;
3082 Py_buffer info;
3083 char lower[11]; /* Enough for any encoding shortcut */
3084
Fred Drakee4315f52000-05-09 19:53:39 +00003085 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003086 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003087 if ((strcmp(lower, "utf-8") == 0) ||
3088 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003089 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003090 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003091 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003092 (strcmp(lower, "iso-8859-1") == 0))
3093 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003094#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003095 else if (strcmp(lower, "mbcs") == 0)
3096 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003097#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003098 else if (strcmp(lower, "ascii") == 0)
3099 return PyUnicode_DecodeASCII(s, size, errors);
3100 else if (strcmp(lower, "utf-16") == 0)
3101 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3102 else if (strcmp(lower, "utf-32") == 0)
3103 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105
3106 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003107 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003108 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003109 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003110 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111 if (buffer == NULL)
3112 goto onError;
3113 unicode = PyCodec_Decode(buffer, encoding, errors);
3114 if (unicode == NULL)
3115 goto onError;
3116 if (!PyUnicode_Check(unicode)) {
3117 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003118 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003119 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 Py_DECREF(unicode);
3121 goto onError;
3122 }
3123 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003124 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003125
Benjamin Peterson29060642009-01-31 22:14:21 +00003126 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 Py_XDECREF(buffer);
3128 return NULL;
3129}
3130
Alexander Belopolsky40018472011-02-26 01:02:56 +00003131PyObject *
3132PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003133 const char *encoding,
3134 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003135{
3136 PyObject *v;
3137
3138 if (!PyUnicode_Check(unicode)) {
3139 PyErr_BadArgument();
3140 goto onError;
3141 }
3142
3143 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003144 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003145
3146 /* Decode via the codec registry */
3147 v = PyCodec_Decode(unicode, encoding, errors);
3148 if (v == NULL)
3149 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003150 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003151
Benjamin Peterson29060642009-01-31 22:14:21 +00003152 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003153 return NULL;
3154}
3155
Alexander Belopolsky40018472011-02-26 01:02:56 +00003156PyObject *
3157PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003158 const char *encoding,
3159 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003160{
3161 PyObject *v;
3162
3163 if (!PyUnicode_Check(unicode)) {
3164 PyErr_BadArgument();
3165 goto onError;
3166 }
3167
3168 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003170
3171 /* Decode via the codec registry */
3172 v = PyCodec_Decode(unicode, encoding, errors);
3173 if (v == NULL)
3174 goto onError;
3175 if (!PyUnicode_Check(v)) {
3176 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003177 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003178 Py_TYPE(v)->tp_name);
3179 Py_DECREF(v);
3180 goto onError;
3181 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003182 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003183
Benjamin Peterson29060642009-01-31 22:14:21 +00003184 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185 return NULL;
3186}
3187
Alexander Belopolsky40018472011-02-26 01:02:56 +00003188PyObject *
3189PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003190 Py_ssize_t size,
3191 const char *encoding,
3192 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193{
3194 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003195
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 unicode = PyUnicode_FromUnicode(s, size);
3197 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003198 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3200 Py_DECREF(unicode);
3201 return v;
3202}
3203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204PyObject *
3205PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003206 const char *encoding,
3207 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003208{
3209 PyObject *v;
3210
3211 if (!PyUnicode_Check(unicode)) {
3212 PyErr_BadArgument();
3213 goto onError;
3214 }
3215
3216 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003217 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003218
3219 /* Encode via the codec registry */
3220 v = PyCodec_Encode(unicode, encoding, errors);
3221 if (v == NULL)
3222 goto onError;
3223 return v;
3224
Benjamin Peterson29060642009-01-31 22:14:21 +00003225 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003226 return NULL;
3227}
3228
/* Locate the first wide character of `wstr` that wcstombs() refuses to
   convert.

   The string is fed to wcstombs() one character at a time (a surrogate
   pair counts as one character when wchar_t is 16 bits wide).  The return
   value is the index, in wchar_t units, of the first character for which
   wcstombs() reports an error, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t single[3];      /* up to two code units + terminator */
#else
    wchar_t single[2];      /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The terminator in the last slot never changes. */
#if SIZEOF_WCHAR_T == 2
    single[2] = 0;
#else
    single[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            single[0] = cursor[0];
            single[1] = cursor[1];
            cursor += 2;
        }
        else {
            single[0] = *cursor;
            single[1] = 0;
            cursor++;
        }
#else
        single[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, single, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3275
Victor Stinner1b579672011-12-17 05:47:23 +01003276static int
3277locale_error_handler(const char *errors, int *surrogateescape)
3278{
3279 if (errors == NULL) {
3280 *surrogateescape = 0;
3281 return 0;
3282 }
3283
3284 if (strcmp(errors, "strict") == 0) {
3285 *surrogateescape = 0;
3286 return 0;
3287 }
3288 if (strcmp(errors, "surrogateescape") == 0) {
3289 *surrogateescape = 1;
3290 return 0;
3291 }
3292 PyErr_Format(PyExc_ValueError,
3293 "only 'strict' and 'surrogateescape' error handlers "
3294 "are supported, not '%s'",
3295 errors);
3296 return -1;
3297}
3298
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003299PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003300PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003301{
3302 Py_ssize_t wlen, wlen2;
3303 wchar_t *wstr;
3304 PyObject *bytes = NULL;
3305 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003306 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003307 PyObject *exc;
3308 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003309 int surrogateescape;
3310
3311 if (locale_error_handler(errors, &surrogateescape) < 0)
3312 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003313
3314 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3315 if (wstr == NULL)
3316 return NULL;
3317
3318 wlen2 = wcslen(wstr);
3319 if (wlen2 != wlen) {
3320 PyMem_Free(wstr);
3321 PyErr_SetString(PyExc_TypeError, "embedded null character");
3322 return NULL;
3323 }
3324
3325 if (surrogateescape) {
3326 /* locale encoding with surrogateescape */
3327 char *str;
3328
3329 str = _Py_wchar2char(wstr, &error_pos);
3330 if (str == NULL) {
3331 if (error_pos == (size_t)-1) {
3332 PyErr_NoMemory();
3333 PyMem_Free(wstr);
3334 return NULL;
3335 }
3336 else {
3337 goto encode_error;
3338 }
3339 }
3340 PyMem_Free(wstr);
3341
3342 bytes = PyBytes_FromString(str);
3343 PyMem_Free(str);
3344 }
3345 else {
3346 size_t len, len2;
3347
3348 len = wcstombs(NULL, wstr, 0);
3349 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003350 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003351 goto encode_error;
3352 }
3353
3354 bytes = PyBytes_FromStringAndSize(NULL, len);
3355 if (bytes == NULL) {
3356 PyMem_Free(wstr);
3357 return NULL;
3358 }
3359
3360 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3361 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003362 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003363 goto encode_error;
3364 }
3365 PyMem_Free(wstr);
3366 }
3367 return bytes;
3368
3369encode_error:
3370 errmsg = strerror(errno);
3371 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003372
3373 if (error_pos == (size_t)-1)
3374 error_pos = wcstombs_errorpos(wstr);
3375
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003376 PyMem_Free(wstr);
3377 Py_XDECREF(bytes);
3378
Victor Stinner2f197072011-12-17 07:08:30 +01003379 if (errmsg != NULL) {
3380 size_t errlen;
3381 wstr = _Py_char2wchar(errmsg, &errlen);
3382 if (wstr != NULL) {
3383 reason = PyUnicode_FromWideChar(wstr, errlen);
3384 PyMem_Free(wstr);
3385 } else
3386 errmsg = NULL;
3387 }
3388 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003389 reason = PyUnicode_FromString(
3390 "wcstombs() encountered an unencodable "
3391 "wide character");
3392 if (reason == NULL)
3393 return NULL;
3394
3395 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3396 "locale", unicode,
3397 (Py_ssize_t)error_pos,
3398 (Py_ssize_t)(error_pos+1),
3399 reason);
3400 Py_DECREF(reason);
3401 if (exc != NULL) {
3402 PyCodec_StrictErrors(exc);
3403 Py_XDECREF(exc);
3404 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003405 return NULL;
3406}
3407
Victor Stinnerad158722010-10-27 00:25:46 +00003408PyObject *
3409PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003410{
Victor Stinner99b95382011-07-04 14:23:54 +02003411#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003412 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003413#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003414 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003415#else
Victor Stinner793b5312011-04-27 00:24:21 +02003416 PyInterpreterState *interp = PyThreadState_GET()->interp;
3417 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3418 cannot use it to encode and decode filenames before it is loaded. Load
3419 the Python codec requires to encode at least its own filename. Use the C
3420 version of the locale codec until the codec registry is initialized and
3421 the Python codec is loaded.
3422
3423 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3424 cannot only rely on it: check also interp->fscodec_initialized for
3425 subinterpreters. */
3426 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003427 return PyUnicode_AsEncodedString(unicode,
3428 Py_FileSystemDefaultEncoding,
3429 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003430 }
3431 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003432 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003433 }
Victor Stinnerad158722010-10-27 00:25:46 +00003434#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003435}
3436
Alexander Belopolsky40018472011-02-26 01:02:56 +00003437PyObject *
3438PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003439 const char *encoding,
3440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441{
3442 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003443 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003444
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 if (!PyUnicode_Check(unicode)) {
3446 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 }
Fred Drakee4315f52000-05-09 19:53:39 +00003449
Fred Drakee4315f52000-05-09 19:53:39 +00003450 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003451 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003452 if ((strcmp(lower, "utf-8") == 0) ||
3453 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003454 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003455 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003456 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003457 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003458 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003459 }
Victor Stinner37296e82010-06-10 13:36:23 +00003460 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003461 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003462 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003464#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003465 else if (strcmp(lower, "mbcs") == 0)
3466 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003467#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003468 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003469 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003470 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003471
3472 /* Encode via the codec registry */
3473 v = PyCodec_Encode(unicode, encoding, errors);
3474 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003475 return NULL;
3476
3477 /* The normal path */
3478 if (PyBytes_Check(v))
3479 return v;
3480
3481 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003482 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003483 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003484 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003485
3486 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3487 "encoder %s returned bytearray instead of bytes",
3488 encoding);
3489 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003490 Py_DECREF(v);
3491 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003492 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003493
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003494 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3495 Py_DECREF(v);
3496 return b;
3497 }
3498
3499 PyErr_Format(PyExc_TypeError,
3500 "encoder did not return a bytes object (type=%.400s)",
3501 Py_TYPE(v)->tp_name);
3502 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003503 return NULL;
3504}
3505
Alexander Belopolsky40018472011-02-26 01:02:56 +00003506PyObject *
3507PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003508 const char *encoding,
3509 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003510{
3511 PyObject *v;
3512
3513 if (!PyUnicode_Check(unicode)) {
3514 PyErr_BadArgument();
3515 goto onError;
3516 }
3517
3518 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003520
3521 /* Encode via the codec registry */
3522 v = PyCodec_Encode(unicode, encoding, errors);
3523 if (v == NULL)
3524 goto onError;
3525 if (!PyUnicode_Check(v)) {
3526 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003527 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003528 Py_TYPE(v)->tp_name);
3529 Py_DECREF(v);
3530 goto onError;
3531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003533
Benjamin Peterson29060642009-01-31 22:14:21 +00003534 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535 return NULL;
3536}
3537
/* Locate the first byte sequence in the first `len` bytes of `str` that
   mbrtowc() cannot convert.

   Return the byte offset of the sequence for which mbrtowc() reports a
   conversion error or an incomplete character.  Return 0 if no such
   sequence is found, or if mbrtowc() is not available (HAVE_MBRTOWC
   undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor;
    size_t remaining;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (cursor = str, remaining = len; remaining != 0; ) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        /* Reached end of string */
        if (consumed == 0)
            break;
        /* Conversion error or incomplete character */
        if (consumed == (size_t)-1 || consumed == (size_t)-2)
            return cursor - str;

        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3568
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003569PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003570PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003571 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572{
3573 wchar_t smallbuf[256];
3574 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3575 wchar_t *wstr;
3576 size_t wlen, wlen2;
3577 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003578 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003579 size_t error_pos;
3580 char *errmsg;
3581 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003582
3583 if (locale_error_handler(errors, &surrogateescape) < 0)
3584 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003585
3586 if (str[len] != '\0' || len != strlen(str)) {
3587 PyErr_SetString(PyExc_TypeError, "embedded null character");
3588 return NULL;
3589 }
3590
3591 if (surrogateescape)
3592 {
3593 wstr = _Py_char2wchar(str, &wlen);
3594 if (wstr == NULL) {
3595 if (wlen == (size_t)-1)
3596 PyErr_NoMemory();
3597 else
3598 PyErr_SetFromErrno(PyExc_OSError);
3599 return NULL;
3600 }
3601
3602 unicode = PyUnicode_FromWideChar(wstr, wlen);
3603 PyMem_Free(wstr);
3604 }
3605 else {
3606#ifndef HAVE_BROKEN_MBSTOWCS
3607 wlen = mbstowcs(NULL, str, 0);
3608#else
3609 wlen = len;
3610#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003611 if (wlen == (size_t)-1)
3612 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613 if (wlen+1 <= smallbuf_len) {
3614 wstr = smallbuf;
3615 }
3616 else {
3617 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3618 return PyErr_NoMemory();
3619
3620 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3621 if (!wstr)
3622 return PyErr_NoMemory();
3623 }
3624
3625 /* This shouldn't fail now */
3626 wlen2 = mbstowcs(wstr, str, wlen+1);
3627 if (wlen2 == (size_t)-1) {
3628 if (wstr != smallbuf)
3629 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003630 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003631 }
3632#ifdef HAVE_BROKEN_MBSTOWCS
3633 assert(wlen2 == wlen);
3634#endif
3635 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3636 if (wstr != smallbuf)
3637 PyMem_Free(wstr);
3638 }
3639 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003640
3641decode_error:
3642 errmsg = strerror(errno);
3643 assert(errmsg != NULL);
3644
3645 error_pos = mbstowcs_errorpos(str, len);
3646 if (errmsg != NULL) {
3647 size_t errlen;
3648 wstr = _Py_char2wchar(errmsg, &errlen);
3649 if (wstr != NULL) {
3650 reason = PyUnicode_FromWideChar(wstr, errlen);
3651 PyMem_Free(wstr);
3652 } else
3653 errmsg = NULL;
3654 }
3655 if (errmsg == NULL)
3656 reason = PyUnicode_FromString(
3657 "mbstowcs() encountered an invalid multibyte sequence");
3658 if (reason == NULL)
3659 return NULL;
3660
3661 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3662 "locale", str, len,
3663 (Py_ssize_t)error_pos,
3664 (Py_ssize_t)(error_pos+1),
3665 reason);
3666 Py_DECREF(reason);
3667 if (exc != NULL) {
3668 PyCodec_StrictErrors(exc);
3669 Py_XDECREF(exc);
3670 }
3671 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003672}
3673
3674PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003675PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003676{
3677 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003678 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679}
3680
3681
3682PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003683PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003684 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003685 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3686}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003687
Christian Heimes5894ba72007-11-04 11:43:14 +00003688PyObject*
3689PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3690{
Victor Stinner99b95382011-07-04 14:23:54 +02003691#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003692 return PyUnicode_DecodeMBCS(s, size, NULL);
3693#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003694 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003695#else
Victor Stinner793b5312011-04-27 00:24:21 +02003696 PyInterpreterState *interp = PyThreadState_GET()->interp;
3697 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3698 cannot use it to encode and decode filenames before it is loaded. Load
3699 the Python codec requires to encode at least its own filename. Use the C
3700 version of the locale codec until the codec registry is initialized and
3701 the Python codec is loaded.
3702
3703 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3704 cannot only rely on it: check also interp->fscodec_initialized for
3705 subinterpreters. */
3706 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003707 return PyUnicode_Decode(s, size,
3708 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003709 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003710 }
3711 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003712 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003713 }
Victor Stinnerad158722010-10-27 00:25:46 +00003714#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003715}
3716
Martin v. Löwis011e8422009-05-05 04:43:17 +00003717
3718int
Antoine Pitrou13348842012-01-29 18:36:34 +01003719_PyUnicode_HasNULChars(PyObject* s)
3720{
3721 static PyObject *nul = NULL;
3722
3723 if (nul == NULL)
3724 nul = PyUnicode_FromStringAndSize("\0", 1);
3725 if (nul == NULL)
3726 return -1;
3727 return PyUnicode_Contains(s, nul);
3728}
3729
3730
3731int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003732PyUnicode_FSConverter(PyObject* arg, void* addr)
3733{
3734 PyObject *output = NULL;
3735 Py_ssize_t size;
3736 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003737 if (arg == NULL) {
3738 Py_DECREF(*(PyObject**)addr);
3739 return 1;
3740 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003741 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003742 output = arg;
3743 Py_INCREF(output);
3744 }
3745 else {
3746 arg = PyUnicode_FromObject(arg);
3747 if (!arg)
3748 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003749 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003750 Py_DECREF(arg);
3751 if (!output)
3752 return 0;
3753 if (!PyBytes_Check(output)) {
3754 Py_DECREF(output);
3755 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3756 return 0;
3757 }
3758 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003759 size = PyBytes_GET_SIZE(output);
3760 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003761 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003762 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003763 Py_DECREF(output);
3764 return 0;
3765 }
3766 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003767 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003768}
3769
3770
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003771int
3772PyUnicode_FSDecoder(PyObject* arg, void* addr)
3773{
3774 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003775 if (arg == NULL) {
3776 Py_DECREF(*(PyObject**)addr);
3777 return 1;
3778 }
3779 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003780 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003781 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003782 output = arg;
3783 Py_INCREF(output);
3784 }
3785 else {
3786 arg = PyBytes_FromObject(arg);
3787 if (!arg)
3788 return 0;
3789 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3790 PyBytes_GET_SIZE(arg));
3791 Py_DECREF(arg);
3792 if (!output)
3793 return 0;
3794 if (!PyUnicode_Check(output)) {
3795 Py_DECREF(output);
3796 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3797 return 0;
3798 }
3799 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003800 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003801 Py_DECREF(output);
3802 return 0;
3803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003804 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003805 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003806 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3807 Py_DECREF(output);
3808 return 0;
3809 }
3810 *(PyObject**)addr = output;
3811 return Py_CLEANUP_SUPPORTED;
3812}
3813
3814
Martin v. Löwis5b222132007-06-10 09:51:05 +00003815char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003817{
Christian Heimesf3863112007-11-22 07:46:41 +00003818 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003819
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003820 if (!PyUnicode_Check(unicode)) {
3821 PyErr_BadArgument();
3822 return NULL;
3823 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003827 if (PyUnicode_UTF8(unicode) == NULL) {
3828 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3830 if (bytes == NULL)
3831 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3833 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834 Py_DECREF(bytes);
3835 return NULL;
3836 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003837 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3838 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3839 PyBytes_AS_STRING(bytes),
3840 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 Py_DECREF(bytes);
3842 }
3843
3844 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003845 *psize = PyUnicode_UTF8_LENGTH(unicode);
3846 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003847}
3848
3849char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003851{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3853}
3854
3855#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003856static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857#endif
3858
3859
3860Py_UNICODE *
3861PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003863 const unsigned char *one_byte;
3864#if SIZEOF_WCHAR_T == 4
3865 const Py_UCS2 *two_bytes;
3866#else
3867 const Py_UCS4 *four_bytes;
3868 const Py_UCS4 *ucs4_end;
3869 Py_ssize_t num_surrogates;
3870#endif
3871 wchar_t *w;
3872 wchar_t *wchar_end;
3873
3874 if (!PyUnicode_Check(unicode)) {
3875 PyErr_BadArgument();
3876 return NULL;
3877 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003878 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 assert(_PyUnicode_KIND(unicode) != 0);
3881 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003882
3883#ifdef Py_DEBUG
3884 ++unicode_as_unicode_calls;
3885#endif
3886
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003888#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3890 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003891 num_surrogates = 0;
3892
3893 for (; four_bytes < ucs4_end; ++four_bytes) {
3894 if (*four_bytes > 0xFFFF)
3895 ++num_surrogates;
3896 }
3897
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3899 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3900 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 PyErr_NoMemory();
3902 return NULL;
3903 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003904 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003905
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003906 w = _PyUnicode_WSTR(unicode);
3907 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3908 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003909 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3910 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003911 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003913 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3914 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003915 }
3916 else
3917 *w = *four_bytes;
3918
3919 if (w > wchar_end) {
3920 assert(0 && "Miscalculated string end");
3921 }
3922 }
3923 *w = 0;
3924#else
3925 /* sizeof(wchar_t) == 4 */
3926 Py_FatalError("Impossible unicode object state, wstr and str "
3927 "should share memory already.");
3928 return NULL;
3929#endif
3930 }
3931 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003932 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3933 (_PyUnicode_LENGTH(unicode) + 1));
3934 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 PyErr_NoMemory();
3936 return NULL;
3937 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003938 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3939 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3940 w = _PyUnicode_WSTR(unicode);
3941 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3944 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 for (; w < wchar_end; ++one_byte, ++w)
3946 *w = *one_byte;
3947 /* null-terminate the wstr */
3948 *w = 0;
3949 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003952 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953 for (; w < wchar_end; ++two_bytes, ++w)
3954 *w = *two_bytes;
3955 /* null-terminate the wstr */
3956 *w = 0;
3957#else
3958 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 PyObject_FREE(_PyUnicode_WSTR(unicode));
3960 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 Py_FatalError("Impossible unicode object state, wstr "
3962 "and str should share memory already.");
3963 return NULL;
3964#endif
3965 }
3966 else {
3967 assert(0 && "This should never happen.");
3968 }
3969 }
3970 }
3971 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003972 *size = PyUnicode_WSTR_LENGTH(unicode);
3973 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003974}
3975
Alexander Belopolsky40018472011-02-26 01:02:56 +00003976Py_UNICODE *
3977PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980}
3981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003982
Alexander Belopolsky40018472011-02-26 01:02:56 +00003983Py_ssize_t
3984PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985{
3986 if (!PyUnicode_Check(unicode)) {
3987 PyErr_BadArgument();
3988 goto onError;
3989 }
3990 return PyUnicode_GET_SIZE(unicode);
3991
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 return -1;
3994}
3995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996Py_ssize_t
3997PyUnicode_GetLength(PyObject *unicode)
3998{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003999 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004000 PyErr_BadArgument();
4001 return -1;
4002 }
4003
4004 return PyUnicode_GET_LENGTH(unicode);
4005}
4006
4007Py_UCS4
4008PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4009{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004010 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4011 PyErr_BadArgument();
4012 return (Py_UCS4)-1;
4013 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004014 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004015 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 return (Py_UCS4)-1;
4017 }
4018 return PyUnicode_READ_CHAR(unicode, index);
4019}
4020
4021int
4022PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4023{
4024 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004025 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004026 return -1;
4027 }
Victor Stinner488fa492011-12-12 00:01:39 +01004028 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004029 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004030 PyErr_SetString(PyExc_IndexError, "string index out of range");
4031 return -1;
4032 }
Victor Stinner488fa492011-12-12 00:01:39 +01004033 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004034 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004035 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4036 PyErr_SetString(PyExc_ValueError, "character out of range");
4037 return -1;
4038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4040 index, ch);
4041 return 0;
4042}
4043
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4049
Victor Stinner554f3f02010-06-16 23:33:54 +00004050/* create or adjust a UnicodeDecodeError */
4051static void
4052make_decode_exception(PyObject **exceptionObject,
4053 const char *encoding,
4054 const char *input, Py_ssize_t length,
4055 Py_ssize_t startpos, Py_ssize_t endpos,
4056 const char *reason)
4057{
4058 if (*exceptionObject == NULL) {
4059 *exceptionObject = PyUnicodeDecodeError_Create(
4060 encoding, input, length, startpos, endpos, reason);
4061 }
4062 else {
4063 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4064 goto onError;
4065 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4066 goto onError;
4067 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4068 goto onError;
4069 }
4070 return;
4071
4072onError:
4073 Py_DECREF(*exceptionObject);
4074 *exceptionObject = NULL;
4075}
4076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077/* error handling callback helper:
4078 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004079 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004080 and adjust various state variables.
4081 return 0 on success, -1 on error
4082*/
4083
Alexander Belopolsky40018472011-02-26 01:02:56 +00004084static int
4085unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004086 const char *encoding, const char *reason,
4087 const char **input, const char **inend, Py_ssize_t *startinpos,
4088 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004089 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004090{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004091 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092
4093 PyObject *restuple = NULL;
4094 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004095 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004096 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004097 Py_ssize_t requiredsize;
4098 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004099 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100 int res = -1;
4101
Victor Stinner596a6c42011-11-09 00:02:18 +01004102 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4103 outsize = PyUnicode_GET_LENGTH(*output);
4104 else
4105 outsize = _PyUnicode_WSTR_LENGTH(*output);
4106
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004107 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004108 *errorHandler = PyCodec_LookupError(errors);
4109 if (*errorHandler == NULL)
4110 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004111 }
4112
Victor Stinner554f3f02010-06-16 23:33:54 +00004113 make_decode_exception(exceptionObject,
4114 encoding,
4115 *input, *inend - *input,
4116 *startinpos, *endinpos,
4117 reason);
4118 if (*exceptionObject == NULL)
4119 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004120
4121 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4122 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004123 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004124 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004125 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004126 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127 }
4128 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004129 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004130 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004131 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004132
4133 /* Copy back the bytes variables, which might have been modified by the
4134 callback */
4135 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4136 if (!inputobj)
4137 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004138 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004141 *input = PyBytes_AS_STRING(inputobj);
4142 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004144 /* we can DECREF safely, as the exception has another reference,
4145 so the object won't go away. */
4146 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004150 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4152 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154
Victor Stinner596a6c42011-11-09 00:02:18 +01004155 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4156 /* need more space? (at least enough for what we
4157 have+the replacement+the rest of the string (starting
4158 at the new input position), so we won't have to check space
4159 when there are no errors in the rest of the string) */
4160 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4161 requiredsize = *outpos + replen + insize-newpos;
4162 if (requiredsize > outsize) {
4163 if (requiredsize<2*outsize)
4164 requiredsize = 2*outsize;
4165 if (unicode_resize(output, requiredsize) < 0)
4166 goto onError;
4167 }
4168 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01004170 copy_characters(*output, *outpos, repunicode, 0, replen);
4171 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004173 else {
4174 wchar_t *repwstr;
4175 Py_ssize_t repwlen;
4176 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4177 if (repwstr == NULL)
4178 goto onError;
4179 /* need more space? (at least enough for what we
4180 have+the replacement+the rest of the string (starting
4181 at the new input position), so we won't have to check space
4182 when there are no errors in the rest of the string) */
4183 requiredsize = *outpos + repwlen + insize-newpos;
4184 if (requiredsize > outsize) {
4185 if (requiredsize < 2*outsize)
4186 requiredsize = 2*outsize;
4187 if (unicode_resize(output, requiredsize) < 0)
4188 goto onError;
4189 }
4190 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4191 *outpos += repwlen;
4192 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004194 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 /* we made it! */
4197 res = 0;
4198
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 Py_XDECREF(restuple);
4201 return res;
4202}
4203
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that each of them may
   evaluate its argument more than once. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The flag arguments are parenthesized so that an expression such as
 * `a || b` can be passed without changing the meaning of the test. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284
Alexander Belopolsky40018472011-02-26 01:02:56 +00004285PyObject *
4286PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004287 Py_ssize_t size,
4288 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004289{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004290 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4291}
4292
Antoine Pitrou244651a2009-05-04 18:56:13 +00004293/* The decoder. The only state we preserve is our read position,
4294 * i.e. how many characters we have consumed. So if we end in the
4295 * middle of a shift sequence we have to back off the read position
4296 * and the output to the beginning of the sequence, otherwise we lose
4297 * all the shift state (seen bits, number of bits seen, high
4298 * surrogate). */
4299
Alexander Belopolsky40018472011-02-26 01:02:56 +00004300PyObject *
4301PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004302 Py_ssize_t size,
4303 const char *errors,
4304 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004305{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004307 Py_ssize_t startinpos;
4308 Py_ssize_t endinpos;
4309 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004311 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312 const char *errmsg = "";
4313 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004314 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 unsigned int base64bits = 0;
4316 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004317 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 PyObject *errorHandler = NULL;
4319 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004321 /* Start off assuming it's all ASCII. Widen later as necessary. */
4322 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 if (!unicode)
4324 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004325 if (size == 0) {
4326 if (consumed)
4327 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004328 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004329 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 e = s + size;
4333
4334 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004335 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004337 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (inShift) { /* in a base-64 section */
4340 if (IS_BASE64(ch)) { /* consume a base-64 character */
4341 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4342 base64bits += 6;
4343 s++;
4344 if (base64bits >= 16) {
4345 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004346 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 base64bits -= 16;
4348 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4349 if (surrogate) {
4350 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004351 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4352 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004353 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4354 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004356 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 }
4358 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004359 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4360 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 }
4363 }
Victor Stinner551ac952011-11-29 22:58:13 +01004364 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 /* first surrogate */
4366 surrogate = outCh;
4367 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004369 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4370 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 }
4372 }
4373 }
4374 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 inShift = 0;
4376 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004378 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4379 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004380 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 if (base64bits > 0) { /* left-over bits */
4383 if (base64bits >= 6) {
4384 /* We've seen at least one base-64 character */
4385 errmsg = "partial character in shift sequence";
4386 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 else {
4389 /* Some bits remain; they should be zero */
4390 if (base64buffer != 0) {
4391 errmsg = "non-zero padding bits in shift sequence";
4392 goto utf7Error;
4393 }
4394 }
4395 }
4396 if (ch != '-') {
4397 /* '-' is absorbed; other terminating
4398 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4400 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004402 }
4403 }
4404 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 s++; /* consume '+' */
4407 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4410 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 }
4412 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
4417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004419 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4420 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 s++;
4422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 else {
4424 startinpos = s-starts;
4425 s++;
4426 errmsg = "unexpected special character";
4427 goto utf7Error;
4428 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 endinpos = s-starts;
4432 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 errors, &errorHandler,
4434 "utf7", errmsg,
4435 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004436 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 /* end of string */
4441
4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4443 /* if we're in an inconsistent state, that's an error */
4444 if (surrogate ||
4445 (base64bits >= 6) ||
4446 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 endinpos = size;
4448 if (unicode_decode_call_errorhandler(
4449 errors, &errorHandler,
4450 "utf7", "unterminated shift sequence",
4451 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004452 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 goto onError;
4454 if (s < e)
4455 goto restart;
4456 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458
4459 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004462 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 }
4465 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004468 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004469
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004470 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471 goto onError;
4472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_XDECREF(errorHandler);
4474 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004475 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 Py_DECREF(unicode);
4481 return NULL;
4482}
4483
4484
Alexander Belopolsky40018472011-02-26 01:02:56 +00004485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004486_PyUnicode_EncodeUTF7(PyObject *str,
4487 int base64SetO,
4488 int base64WhiteSpace,
4489 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004491 int kind;
4492 void *data;
4493 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004494 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004495 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004497 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498 unsigned int base64bits = 0;
4499 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 char * out;
4501 char * start;
4502
Benjamin Petersonbac79492012-01-14 13:34:47 -05004503 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004504 return NULL;
4505 kind = PyUnicode_KIND(str);
4506 data = PyUnicode_DATA(str);
4507 len = PyUnicode_GET_LENGTH(str);
4508
4509 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004510 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004512 /* It might be possible to tighten this worst case */
4513 allocated = 8 * len;
4514 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004515 return PyErr_NoMemory();
4516
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004518 if (v == NULL)
4519 return NULL;
4520
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004521 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004523 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525 if (inShift) {
4526 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4527 /* shifting out */
4528 if (base64bits) { /* output remaining bits */
4529 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4530 base64buffer = 0;
4531 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 }
4533 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 /* Characters not in the BASE64 set implicitly unshift the sequence
4535 so no '-' is required, except if the character is itself a '-' */
4536 if (IS_BASE64(ch) || ch == '-') {
4537 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 *out++ = (char) ch;
4540 }
4541 else {
4542 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004543 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 else { /* not in a shift sequence */
4546 if (ch == '+') {
4547 *out++ = '+';
4548 *out++ = '-';
4549 }
4550 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4551 *out++ = (char) ch;
4552 }
4553 else {
4554 *out++ = '+';
4555 inShift = 1;
4556 goto encode_char;
4557 }
4558 }
4559 continue;
4560encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004562 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004563
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 /* code first surrogate */
4565 base64bits += 16;
4566 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4567 while (base64bits >= 6) {
4568 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569 base64bits -= 6;
4570 }
4571 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004572 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits += 16;
4575 base64buffer = (base64buffer << 16) | ch;
4576 while (base64bits >= 6) {
4577 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4578 base64bits -= 6;
4579 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits)
4582 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4583 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004585 if (_PyBytes_Resize(&v, out - start) < 0)
4586 return NULL;
4587 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004589PyObject *
4590PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4591 Py_ssize_t size,
4592 int base64SetO,
4593 int base64WhiteSpace,
4594 const char *errors)
4595{
4596 PyObject *result;
4597 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4598 if (tmp == NULL)
4599 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004600 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004601 base64WhiteSpace, errors);
4602 Py_DECREF(tmp);
4603 return result;
4604}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606#undef IS_BASE64
4607#undef FROM_BASE64
4608#undef TO_BASE64
4609#undef DECODE_DIRECT
4610#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612/* --- UTF-8 Codec -------------------------------------------------------- */
4613
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4635
Alexander Belopolsky40018472011-02-26 01:02:56 +00004636PyObject *
4637PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004638 Py_ssize_t size,
4639 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640{
Walter Dörwald69652032004-09-07 20:24:22 +00004641 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4642}
4643
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004644#include "stringlib/ucs1lib.h"
4645#include "stringlib/codecs.h"
4646#include "stringlib/undef.h"
4647
4648#include "stringlib/ucs2lib.h"
4649#include "stringlib/codecs.h"
4650#include "stringlib/undef.h"
4651
4652#include "stringlib/ucs4lib.h"
4653#include "stringlib/codecs.h"
4654#include "stringlib/undef.h"
4655
Antoine Pitrouab868312009-01-10 15:40:25 +00004656/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4657#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4658
4659/* Mask to quickly check whether a C 'long' contains a
4660 non-ASCII, UTF8-encoded char. */
4661#if (SIZEOF_LONG == 8)
4662# define ASCII_CHAR_MASK 0x8080808080808080L
4663#elif (SIZEOF_LONG == 4)
4664# define ASCII_CHAR_MASK 0x80808080L
4665#else
4666# error C 'long' size should be either 4 or 8!
4667#endif
4668
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004669/* Scans a UTF-8 string and returns the maximum character to be expected
4670 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004671
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004672 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674 */
4675static Py_UCS4
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004676utf8_scanner(const unsigned char *p, Py_ssize_t string_size, Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004678 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004679 const unsigned char *end = p + string_size;
4680 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004682 assert(unicode_size != NULL);
4683
4684 /* By having a cascade of independent loops which fallback onto each
4685 other, we minimize the amount of work done in the average loop
4686 iteration, and we also maximize the CPU's ability to predict
4687 branches correctly (because a given condition will have always the
4688 same boolean outcome except perhaps in the last iteration of the
4689 corresponding loop).
4690 In the general case this brings us rather close to decoding
4691 performance pre-PEP 393, despite the two-pass decoding.
4692
4693 Note that the pure ASCII loop is not duplicated once a non-ASCII
4694 character has been encountered. It is actually a pessimization (by
4695 a significant factor) to use this loop on text with many non-ASCII
4696 characters, and it is important to avoid bad performance on valid
4697 utf-8 data (invalid utf-8 being a different can of worms).
4698 */
4699
4700 /* ASCII */
4701 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702 /* Only check value if it's not a ASCII char... */
4703 if (*p < 0x80) {
4704 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4705 an explanation. */
4706 if (!((size_t) p & LONG_PTR_MASK)) {
4707 /* Help register allocation */
4708 register const unsigned char *_p = p;
4709 while (_p < aligned_end) {
4710 unsigned long value = *(unsigned long *) _p;
4711 if (value & ASCII_CHAR_MASK)
4712 break;
4713 _p += SIZEOF_LONG;
4714 char_count += SIZEOF_LONG;
4715 }
4716 p = _p;
4717 if (p == end)
4718 break;
4719 }
4720 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004721 if (*p < 0x80)
4722 ++char_count;
4723 else
4724 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004725 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004726 *unicode_size = char_count;
4727 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004728
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004729_ucs1loop:
4730 for (; p < end; ++p) {
4731 if (*p < 0xc4)
4732 char_count += ((*p & 0xc0) != 0x80);
4733 else
4734 goto _ucs2loop;
4735 }
4736 *unicode_size = char_count;
4737 return 255;
4738
4739_ucs2loop:
4740 for (; p < end; ++p) {
4741 if (*p < 0xf0)
4742 char_count += ((*p & 0xc0) != 0x80);
4743 else
4744 goto _ucs4loop;
4745 }
4746 *unicode_size = char_count;
4747 return 65535;
4748
4749_ucs4loop:
4750 for (; p < end; ++p) {
4751 char_count += ((*p & 0xc0) != 0x80);
4752 }
4753 *unicode_size = char_count;
4754 return 65537;
4755}
4756
/* Store `value` at position `index` of the string being built, growing
   the string first when the position lies beyond its current length and
   leaving any widening to unicode_putchar().
   Implicit parameters: the `unicode` variable and the `onError` label of
   the enclosing function; any failure jumps to onError.
   Growth overallocates (index + index/8), so the result needs to be
   shrunk at the end.
   NOTE(review): a write at index == current length is not preceded by a
   resize -- confirm that this is intended.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t write_at = index;                                    \
        if (write_at > PyUnicode_GET_LENGTH(unicode)) {                 \
            if (unicode_resize(&unicode, write_at + write_at/8) < 0)    \
                goto onError;                                           \
        }                                                               \
        if (unicode_putchar(&unicode, &write_at, value) < 0)            \
            goto onError;                                               \
    } while (0)
4770
Victor Stinnerbf6e5602011-12-12 01:53:47 +01004771static PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004772decode_utf8_errors(const char *starts,
4773 Py_ssize_t size,
4774 const char *errors,
4775 Py_ssize_t *consumed,
4776 const char *s,
4777 PyObject *unicode,
4778 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004779{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004781 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004782 Py_ssize_t startinpos;
4783 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004784 const char *e = starts + size;
4785 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004786 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004787 PyObject *errorHandler = NULL;
4788 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004789
Antoine Pitrouab868312009-01-10 15:40:25 +00004790 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004791
4792 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004793 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794
4795 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004796 /* Fast path for runs of ASCII characters. Given that common UTF-8
4797 input will consist of an overwhelming majority of ASCII
4798 characters, we try to optimize for this case by checking
4799 as many characters as a C 'long' can contain.
4800 First, check if we can do an aligned read, as most CPUs have
4801 a penalty for unaligned reads.
4802 */
4803 if (!((size_t) s & LONG_PTR_MASK)) {
4804 /* Help register allocation */
4805 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004806 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004807 while (_s < aligned_end) {
4808 /* Read a whole long at a time (either 4 or 8 bytes),
4809 and do a fast unrolled copy if it only contains ASCII
4810 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 unsigned long value = *(unsigned long *) _s;
4812 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004813 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004814 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4815 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4816 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4817 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004818#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004819 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4820 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4821 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4822 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004823#endif
4824 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004825 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004826 }
4827 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004829 if (s == e)
4830 break;
4831 ch = (unsigned char)*s;
4832 }
4833 }
4834
4835 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004836 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837 s++;
4838 continue;
4839 }
4840
4841 n = utf8_code_length[ch];
4842
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004843 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 if (consumed)
4845 break;
4846 else {
4847 errmsg = "unexpected end of data";
4848 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004849 endinpos = startinpos+1;
4850 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4851 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 goto utf8Error;
4853 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855
4856 switch (n) {
4857
4858 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004859 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 startinpos = s-starts;
4861 endinpos = startinpos+1;
4862 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863
4864 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004865 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004866 startinpos = s-starts;
4867 endinpos = startinpos+1;
4868 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869
4870 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004871 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004872 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004873 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004874 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 goto utf8Error;
4876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004878 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004879 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880 break;
4881
4882 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004883 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4884 will result in surrogates in range d800-dfff. Surrogates are
4885 not valid UTF-8 so they are rejected.
4886 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4887 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004888 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004889 (s[2] & 0xc0) != 0x80 ||
4890 ((unsigned char)s[0] == 0xE0 &&
4891 (unsigned char)s[1] < 0xA0) ||
4892 ((unsigned char)s[0] == 0xED &&
4893 (unsigned char)s[1] > 0x9F)) {
4894 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004896 endinpos = startinpos + 1;
4897
4898 /* if s[1] first two bits are 1 and 0, then the invalid
4899 continuation byte is s[2], so increment endinpos by 1,
4900 if not, s[1] is invalid and endinpos doesn't need to
4901 be incremented. */
4902 if ((s[1] & 0xC0) == 0x80)
4903 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 goto utf8Error;
4905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004906 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004907 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004908 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004909 break;
4910
4911 case 4:
4912 if ((s[1] & 0xc0) != 0x80 ||
4913 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004914 (s[3] & 0xc0) != 0x80 ||
4915 ((unsigned char)s[0] == 0xF0 &&
4916 (unsigned char)s[1] < 0x90) ||
4917 ((unsigned char)s[0] == 0xF4 &&
4918 (unsigned char)s[1] > 0x8F)) {
4919 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004920 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004921 endinpos = startinpos + 1;
4922 if ((s[1] & 0xC0) == 0x80) {
4923 endinpos++;
4924 if ((s[2] & 0xC0) == 0x80)
4925 endinpos++;
4926 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 goto utf8Error;
4928 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004929 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004930 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004931 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004932
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004933 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 }
4936 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004938
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 if (unicode_decode_call_errorhandler(
4941 errors, &errorHandler,
Victor Stinnercbe01342012-02-14 01:17:45 +01004942 "utf-8", errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004944 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004946 /* Update data because unicode_decode_call_errorhandler might have
4947 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004948 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004949 }
Walter Dörwald69652032004-09-07 20:24:22 +00004950 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004953 /* Adjust length and ready string when it contained errors and
4954 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004955 if (unicode_resize(&unicode, i) < 0)
4956 goto onError;
4957 unicode_adjust_maxchar(&unicode);
4958 if (unicode == NULL)
4959 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004961 Py_XDECREF(errorHandler);
4962 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004963 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004964 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965
Benjamin Peterson29060642009-01-31 22:14:21 +00004966 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004967 Py_XDECREF(errorHandler);
4968 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004969 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970 return NULL;
4971}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004972#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004973
Victor Stinner785938e2011-12-11 20:09:03 +01004974PyObject *
4975PyUnicode_DecodeUTF8Stateful(const char *s,
4976 Py_ssize_t size,
4977 const char *errors,
4978 Py_ssize_t *consumed)
4979{
4980 Py_UCS4 maxchar = 0;
4981 Py_ssize_t unicode_size;
4982 int has_errors = 0;
4983 PyObject *unicode;
4984 int kind;
4985 void *data;
4986 const char *starts = s;
4987 const char *e;
4988 Py_ssize_t i;
4989
4990 if (size == 0) {
4991 if (consumed)
4992 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004993 Py_INCREF(unicode_empty);
4994 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004995 }
4996
Victor Stinnera1d12bb2011-12-11 21:53:09 +01004997 maxchar = utf8_scanner((const unsigned char *)s, size, &unicode_size);
Victor Stinner785938e2011-12-11 20:09:03 +01004998
4999 /* When the string is ASCII only, just use memcpy and return.
5000 unicode_size may be != size if there is an incomplete UTF-8
5001 sequence at the end of the ASCII block. */
5002 if (maxchar < 128 && size == unicode_size) {
5003 if (consumed)
5004 *consumed = size;
Victor Stinnerab870212011-12-17 22:39:43 +01005005 return unicode_fromascii((const unsigned char *)s, size);
Victor Stinner785938e2011-12-11 20:09:03 +01005006 }
5007
5008 unicode = PyUnicode_New(unicode_size, maxchar);
5009 if (!unicode)
5010 return NULL;
5011 kind = PyUnicode_KIND(unicode);
5012 data = PyUnicode_DATA(unicode);
5013
5014 /* Unpack UTF-8 encoded data */
5015 i = 0;
5016 e = starts + size;
5017 switch (kind) {
5018 case PyUnicode_1BYTE_KIND:
5019 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
5020 break;
5021 case PyUnicode_2BYTE_KIND:
5022 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
5023 break;
5024 case PyUnicode_4BYTE_KIND:
5025 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
5026 break;
5027 }
5028 if (!has_errors) {
5029 /* Ensure the unicode size calculation was correct */
5030 assert(i == unicode_size);
5031 assert(s == e);
5032 if (consumed)
5033 *consumed = size;
5034 return unicode;
5035 }
5036
5037 /* In case of errors, maxchar and size computation might be incorrect;
5038 code below refits and resizes as necessary. */
5039 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
5040}
5041
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005042#ifdef __APPLE__
5043
5044/* Simplified UTF-8 decoder using surrogateescape error handler,
5045 used to decode the command line arguments on Mac OS X. */
5046
5047wchar_t*
5048_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
5049{
5050 int n;
5051 const char *e;
5052 wchar_t *unicode, *p;
5053
5054 /* Note: size will always be longer than the resulting Unicode
5055 character count */
5056 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
5057 PyErr_NoMemory();
5058 return NULL;
5059 }
5060 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
5061 if (!unicode)
5062 return NULL;
5063
5064 /* Unpack UTF-8 encoded data */
5065 p = unicode;
5066 e = s + size;
5067 while (s < e) {
5068 Py_UCS4 ch = (unsigned char)*s;
5069
5070 if (ch < 0x80) {
5071 *p++ = (wchar_t)ch;
5072 s++;
5073 continue;
5074 }
5075
5076 n = utf8_code_length[ch];
5077 if (s + n > e) {
5078 goto surrogateescape;
5079 }
5080
5081 switch (n) {
5082 case 0:
5083 case 1:
5084 goto surrogateescape;
5085
5086 case 2:
5087 if ((s[1] & 0xc0) != 0x80)
5088 goto surrogateescape;
5089 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
5090 assert ((ch > 0x007F) && (ch <= 0x07FF));
5091 *p++ = (wchar_t)ch;
5092 break;
5093
5094 case 3:
5095 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
5096 will result in surrogates in range d800-dfff. Surrogates are
5097 not valid UTF-8 so they are rejected.
5098 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
5099 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
5100 if ((s[1] & 0xc0) != 0x80 ||
5101 (s[2] & 0xc0) != 0x80 ||
5102 ((unsigned char)s[0] == 0xE0 &&
5103 (unsigned char)s[1] < 0xA0) ||
5104 ((unsigned char)s[0] == 0xED &&
5105 (unsigned char)s[1] > 0x9F)) {
5106
5107 goto surrogateescape;
5108 }
5109 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5110 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112 break;
5113
5114 case 4:
5115 if ((s[1] & 0xc0) != 0x80 ||
5116 (s[2] & 0xc0) != 0x80 ||
5117 (s[3] & 0xc0) != 0x80 ||
5118 ((unsigned char)s[0] == 0xF0 &&
5119 (unsigned char)s[1] < 0x90) ||
5120 ((unsigned char)s[0] == 0xF4 &&
5121 (unsigned char)s[1] > 0x8F)) {
5122 goto surrogateescape;
5123 }
5124 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
5125 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01005126 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005127
5128#if SIZEOF_WCHAR_T == 4
5129 *p++ = (wchar_t)ch;
5130#else
5131 /* compute and append the two surrogates: */
Victor Stinner551ac952011-11-29 22:58:13 +01005132 *p++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5133 *p++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134#endif
5135 break;
5136 }
5137 s += n;
5138 continue;
5139
5140 surrogateescape:
5141 *p++ = 0xDC00 + ch;
5142 s++;
5143 }
5144 *p = L'\0';
5145 return unicode;
5146}
5147
5148#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00005149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005150/* Primary internal function which creates utf8 encoded bytes objects.
5151
5152 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005153 and allocate exactly as much space needed at the end. Else allocate the
5154 maximum possible needed (4 result bytes per Unicode character), and return
5155 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005156*/
Tim Peters7e3d9612002-04-21 03:26:37 +00005157PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01005158_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159{
Victor Stinner6099a032011-12-18 14:22:26 +01005160 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005161 void *data;
5162 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005164 if (!PyUnicode_Check(unicode)) {
5165 PyErr_BadArgument();
5166 return NULL;
5167 }
5168
5169 if (PyUnicode_READY(unicode) == -1)
5170 return NULL;
5171
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005172 if (PyUnicode_UTF8(unicode))
5173 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5174 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005175
5176 kind = PyUnicode_KIND(unicode);
5177 data = PyUnicode_DATA(unicode);
5178 size = PyUnicode_GET_LENGTH(unicode);
5179
Benjamin Petersonead6b532011-12-20 17:23:42 -06005180 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005181 default:
5182 assert(0);
5183 case PyUnicode_1BYTE_KIND:
5184 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5185 assert(!PyUnicode_IS_ASCII(unicode));
5186 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5187 case PyUnicode_2BYTE_KIND:
5188 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5189 case PyUnicode_4BYTE_KIND:
5190 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192}
5193
Alexander Belopolsky40018472011-02-26 01:02:56 +00005194PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005195PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5196 Py_ssize_t size,
5197 const char *errors)
5198{
5199 PyObject *v, *unicode;
5200
5201 unicode = PyUnicode_FromUnicode(s, size);
5202 if (unicode == NULL)
5203 return NULL;
5204 v = _PyUnicode_AsUTF8String(unicode, errors);
5205 Py_DECREF(unicode);
5206 return v;
5207}
5208
5209PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005210PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005212 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213}
5214
Walter Dörwald41980ca2007-08-16 21:55:45 +00005215/* --- UTF-32 Codec ------------------------------------------------------- */
5216
5217PyObject *
5218PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005219 Py_ssize_t size,
5220 const char *errors,
5221 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005222{
5223 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5224}
5225
5226PyObject *
5227PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 Py_ssize_t size,
5229 const char *errors,
5230 int *byteorder,
5231 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005232{
5233 const char *starts = s;
5234 Py_ssize_t startinpos;
5235 Py_ssize_t endinpos;
5236 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005237 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005238 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005239 int bo = 0; /* assume native ordering by default */
5240 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005241 /* Offsets from q for retrieving bytes in the right order. */
5242#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5243 int iorder[] = {0, 1, 2, 3};
5244#else
5245 int iorder[] = {3, 2, 1, 0};
5246#endif
5247 PyObject *errorHandler = NULL;
5248 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005249
Walter Dörwald41980ca2007-08-16 21:55:45 +00005250 q = (unsigned char *)s;
5251 e = q + size;
5252
5253 if (byteorder)
5254 bo = *byteorder;
5255
5256 /* Check for BOM marks (U+FEFF) in the input and adjust current
5257 byte order setting accordingly. In native mode, the leading BOM
5258 mark is skipped, in all other modes, it is copied to the output
5259 stream as-is (giving a ZWNBSP character). */
5260 if (bo == 0) {
5261 if (size >= 4) {
5262 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005263 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005264#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 if (bom == 0x0000FEFF) {
5266 q += 4;
5267 bo = -1;
5268 }
5269 else if (bom == 0xFFFE0000) {
5270 q += 4;
5271 bo = 1;
5272 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005273#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 if (bom == 0x0000FEFF) {
5275 q += 4;
5276 bo = 1;
5277 }
5278 else if (bom == 0xFFFE0000) {
5279 q += 4;
5280 bo = -1;
5281 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005282#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005284 }
5285
5286 if (bo == -1) {
5287 /* force LE */
5288 iorder[0] = 0;
5289 iorder[1] = 1;
5290 iorder[2] = 2;
5291 iorder[3] = 3;
5292 }
5293 else if (bo == 1) {
5294 /* force BE */
5295 iorder[0] = 3;
5296 iorder[1] = 2;
5297 iorder[2] = 1;
5298 iorder[3] = 0;
5299 }
5300
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005301 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005302 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005303 if (!unicode)
5304 return NULL;
5305 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005306 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005307 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005308
Walter Dörwald41980ca2007-08-16 21:55:45 +00005309 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 Py_UCS4 ch;
5311 /* remaining bytes at the end? (size should be divisible by 4) */
5312 if (e-q<4) {
5313 if (consumed)
5314 break;
5315 errmsg = "truncated data";
5316 startinpos = ((const char *)q)-starts;
5317 endinpos = ((const char *)e)-starts;
5318 goto utf32Error;
5319 /* The remaining input chars are ignored if the callback
5320 chooses to skip the input */
5321 }
5322 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5323 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005324
Benjamin Peterson29060642009-01-31 22:14:21 +00005325 if (ch >= 0x110000)
5326 {
5327 errmsg = "codepoint not in range(0x110000)";
5328 startinpos = ((const char *)q)-starts;
5329 endinpos = startinpos+4;
5330 goto utf32Error;
5331 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005332 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5333 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 q += 4;
5335 continue;
5336 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 if (unicode_decode_call_errorhandler(
5338 errors, &errorHandler,
5339 "utf32", errmsg,
5340 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005341 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005342 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005343 }
5344
5345 if (byteorder)
5346 *byteorder = bo;
5347
5348 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005349 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005350
5351 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005352 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005353 goto onError;
5354
5355 Py_XDECREF(errorHandler);
5356 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005357 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005358
Benjamin Peterson29060642009-01-31 22:14:21 +00005359 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005360 Py_DECREF(unicode);
5361 Py_XDECREF(errorHandler);
5362 Py_XDECREF(exc);
5363 return NULL;
5364}
5365
5366PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005367_PyUnicode_EncodeUTF32(PyObject *str,
5368 const char *errors,
5369 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005371 int kind;
5372 void *data;
5373 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005374 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005375 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005376 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377 /* Offsets from p for storing byte pairs in the right order. */
5378#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5379 int iorder[] = {0, 1, 2, 3};
5380#else
5381 int iorder[] = {3, 2, 1, 0};
5382#endif
5383
Benjamin Peterson29060642009-01-31 22:14:21 +00005384#define STORECHAR(CH) \
5385 do { \
5386 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5387 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5388 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5389 p[iorder[0]] = (CH) & 0xff; \
5390 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005391 } while(0)
5392
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005393 if (!PyUnicode_Check(str)) {
5394 PyErr_BadArgument();
5395 return NULL;
5396 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005397 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005398 return NULL;
5399 kind = PyUnicode_KIND(str);
5400 data = PyUnicode_DATA(str);
5401 len = PyUnicode_GET_LENGTH(str);
5402
5403 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005404 bytesize = nsize * 4;
5405 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005407 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408 if (v == NULL)
5409 return NULL;
5410
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005411 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005412 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005413 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005414 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005415 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005416
5417 if (byteorder == -1) {
5418 /* force LE */
5419 iorder[0] = 0;
5420 iorder[1] = 1;
5421 iorder[2] = 2;
5422 iorder[3] = 3;
5423 }
5424 else if (byteorder == 1) {
5425 /* force BE */
5426 iorder[0] = 3;
5427 iorder[1] = 2;
5428 iorder[2] = 1;
5429 iorder[3] = 0;
5430 }
5431
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005432 for (i = 0; i < len; i++)
5433 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005434
5435 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005436 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005437#undef STORECHAR
5438}
5439
Alexander Belopolsky40018472011-02-26 01:02:56 +00005440PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005441PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5442 Py_ssize_t size,
5443 const char *errors,
5444 int byteorder)
5445{
5446 PyObject *result;
5447 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5448 if (tmp == NULL)
5449 return NULL;
5450 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5451 Py_DECREF(tmp);
5452 return result;
5453}
5454
5455PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005456PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005457{
Victor Stinnerb960b342011-11-20 19:12:52 +01005458 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005459}
5460
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461/* --- UTF-16 Codec ------------------------------------------------------- */
5462
Tim Peters772747b2001-08-09 22:21:55 +00005463PyObject *
5464PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005465 Py_ssize_t size,
5466 const char *errors,
5467 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468{
Walter Dörwald69652032004-09-07 20:24:22 +00005469 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5470}
5471
/* Masks used by the UTF-16 decoder's fast path, which reads the input
   one C 'long' at a time.

   FAST_CHAR_MASK and SWAPPED_FAST_CHAR_MASK allow fast checking of
   whether a C 'long' may contain UTF16-encoded surrogate characters.
   This is an efficient heuristic, assuming that non-surrogate characters
   with a code point >= 0x8000 are rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   STRIPPED_MASK selects the low byte of every 16-bit unit in a 'long';
   it is used to swap the two bytes of each unit of byteswapped input.
*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005479#if (SIZEOF_LONG == 8)
5480# define FAST_CHAR_MASK 0x8000800080008000L
5481# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005482# define STRIPPED_MASK 0x00FF00FF00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005483#elif (SIZEOF_LONG == 4)
5484# define FAST_CHAR_MASK 0x80008000L
5485# define SWAPPED_FAST_CHAR_MASK 0x00800080L
Victor Stinnerafb52052012-04-05 22:54:49 +02005486# define STRIPPED_MASK 0x00FF00FFL
Antoine Pitrouab868312009-01-10 15:40:25 +00005487#else
5488# error C 'long' size should be either 4 or 8!
5489#endif
5490
Walter Dörwald69652032004-09-07 20:24:22 +00005491PyObject *
5492PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 Py_ssize_t size,
5494 const char *errors,
5495 int *byteorder,
5496 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005499 Py_ssize_t startinpos;
5500 Py_ssize_t endinpos;
5501 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005502 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005503 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005504 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005505 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005506 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005507 /* Offsets from q for retrieving byte pairs in the right order. */
5508#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5509 int ihi = 1, ilo = 0;
5510#else
5511 int ihi = 0, ilo = 1;
5512#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005513 PyObject *errorHandler = NULL;
5514 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515
5516 /* Note: size will always be longer than the resulting Unicode
5517 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005518 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005519 if (!unicode)
5520 return NULL;
5521 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005522 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005523 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524
Tim Peters772747b2001-08-09 22:21:55 +00005525 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005526 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527
5528 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005529 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005531 /* Check for BOM marks (U+FEFF) in the input and adjust current
5532 byte order setting accordingly. In native mode, the leading BOM
5533 mark is skipped, in all other modes, it is copied to the output
5534 stream as-is (giving a ZWNBSP character). */
5535 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005536 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005537 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005538#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 if (bom == 0xFEFF) {
5540 q += 2;
5541 bo = -1;
5542 }
5543 else if (bom == 0xFFFE) {
5544 q += 2;
5545 bo = 1;
5546 }
Tim Petersced69f82003-09-16 20:30:58 +00005547#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 if (bom == 0xFEFF) {
5549 q += 2;
5550 bo = 1;
5551 }
5552 else if (bom == 0xFFFE) {
5553 q += 2;
5554 bo = -1;
5555 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005556#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005557 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005558 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559
Tim Peters772747b2001-08-09 22:21:55 +00005560 if (bo == -1) {
5561 /* force LE */
5562 ihi = 1;
5563 ilo = 0;
5564 }
5565 else if (bo == 1) {
5566 /* force BE */
5567 ihi = 0;
5568 ilo = 1;
5569 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005570#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5571 native_ordering = ilo < ihi;
5572#else
5573 native_ordering = ilo > ihi;
5574#endif
Tim Peters772747b2001-08-09 22:21:55 +00005575
Antoine Pitrouab868312009-01-10 15:40:25 +00005576 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005577 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005578 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005579 /* First check for possible aligned read of a C 'long'. Unaligned
5580 reads are more expensive, better to defer to another iteration. */
5581 if (!((size_t) q & LONG_PTR_MASK)) {
5582 /* Fast path for runs of non-surrogate chars. */
5583 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584 int kind = PyUnicode_KIND(unicode);
5585 void *data = PyUnicode_DATA(unicode);
5586 while (_q < aligned_end) {
Victor Stinnerafb52052012-04-05 22:54:49 +02005587 unsigned long block = * (unsigned long *) _q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005588 Py_UCS4 maxch;
5589 if (native_ordering) {
5590 /* Can use buffer directly */
Victor Stinnerafb52052012-04-05 22:54:49 +02005591 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005592 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005593 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005594 else {
5595 /* Need to byte-swap */
Victor Stinnerafb52052012-04-05 22:54:49 +02005596 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005597 break;
Victor Stinnerafb52052012-04-05 22:54:49 +02005598 block = ((block >> 8) & STRIPPED_MASK) |
5599 ((block & STRIPPED_MASK) << 8);
Antoine Pitrouab868312009-01-10 15:40:25 +00005600 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005601 maxch = (Py_UCS2)(block & 0xFFFF);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005602#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005603 ch = (Py_UCS2)((block >> 16) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005604 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005605 ch = (Py_UCS2)((block >> 32) & 0xFFFF);
Victor Stinnere6abb482012-05-02 01:15:40 +02005606 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005607 ch = (Py_UCS2)(block >> 48);
Victor Stinnere6abb482012-05-02 01:15:40 +02005608 maxch = MAX_MAXCHAR(maxch, ch);
Victor Stinnerafb52052012-04-05 22:54:49 +02005609#else
5610 ch = (Py_UCS2)(block >> 16);
Victor Stinnere6abb482012-05-02 01:15:40 +02005611 maxch = MAX_MAXCHAR(maxch, ch);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005612#endif
5613 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5614 if (unicode_widen(&unicode, maxch) < 0)
5615 goto onError;
5616 kind = PyUnicode_KIND(unicode);
5617 data = PyUnicode_DATA(unicode);
5618 }
Victor Stinnerafb52052012-04-05 22:54:49 +02005619#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5620 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005621#if SIZEOF_LONG == 8
Victor Stinnerafb52052012-04-05 22:54:49 +02005622 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5623 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5624 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5625#else
5626 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5627#endif
5628#else
5629#if SIZEOF_LONG == 8
5630 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48)));
5631 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF));
5632 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF));
5633#else
5634 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16));
5635#endif
5636 PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637#endif
5638 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005639 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005640 q = _q;
5641 if (q >= e)
5642 break;
5643 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005644 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645
Benjamin Peterson14339b62009-01-31 16:36:08 +00005646 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005647
Victor Stinner551ac952011-11-29 22:58:13 +01005648 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005649 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5650 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005651 continue;
5652 }
5653
5654 /* UTF-16 code pair: */
5655 if (q > e) {
5656 errmsg = "unexpected end of data";
5657 startinpos = (((const char *)q) - 2) - starts;
5658 endinpos = ((const char *)e) + 1 - starts;
5659 goto utf16Error;
5660 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005661 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5662 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005663 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005664 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005665 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005666 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005667 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005668 continue;
5669 }
5670 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005671 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005672 startinpos = (((const char *)q)-4)-starts;
5673 endinpos = startinpos+2;
5674 goto utf16Error;
5675 }
5676
Benjamin Peterson14339b62009-01-31 16:36:08 +00005677 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005678 errmsg = "illegal encoding";
5679 startinpos = (((const char *)q)-2)-starts;
5680 endinpos = startinpos+2;
5681 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005682
Benjamin Peterson29060642009-01-31 22:14:21 +00005683 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005685 errors,
5686 &errorHandler,
5687 "utf16", errmsg,
5688 &starts,
5689 (const char **)&e,
5690 &startinpos,
5691 &endinpos,
5692 &exc,
5693 (const char **)&q,
5694 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005698 /* remaining byte at the end? (size should be even) */
5699 if (e == q) {
5700 if (!consumed) {
5701 errmsg = "truncated data";
5702 startinpos = ((const char *)q) - starts;
5703 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005704 if (unicode_decode_call_errorhandler(
5705 errors,
5706 &errorHandler,
5707 "utf16", errmsg,
5708 &starts,
5709 (const char **)&e,
5710 &startinpos,
5711 &endinpos,
5712 &exc,
5713 (const char **)&q,
5714 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005715 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005716 goto onError;
5717 /* The remaining input chars are ignored if the callback
5718 chooses to skip the input */
5719 }
5720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721
5722 if (byteorder)
5723 *byteorder = bo;
5724
Walter Dörwald69652032004-09-07 20:24:22 +00005725 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005727
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005729 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 goto onError;
5731
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005734 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005737 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005738 Py_XDECREF(errorHandler);
5739 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 return NULL;
5741}
5742
Antoine Pitrouab868312009-01-10 15:40:25 +00005743#undef FAST_CHAR_MASK
5744#undef SWAPPED_FAST_CHAR_MASK
5745
Tim Peters772747b2001-08-09 22:21:55 +00005746PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005747_PyUnicode_EncodeUTF16(PyObject *str,
5748 const char *errors,
5749 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005751 int kind;
5752 void *data;
5753 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005754 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005755 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005756 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005757 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005758 /* Offsets from p for storing byte pairs in the right order. */
5759#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5760 int ihi = 1, ilo = 0;
5761#else
5762 int ihi = 0, ilo = 1;
5763#endif
5764
Benjamin Peterson29060642009-01-31 22:14:21 +00005765#define STORECHAR(CH) \
5766 do { \
5767 p[ihi] = ((CH) >> 8) & 0xff; \
5768 p[ilo] = (CH) & 0xff; \
5769 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005770 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772 if (!PyUnicode_Check(str)) {
5773 PyErr_BadArgument();
5774 return NULL;
5775 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005776 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005777 return NULL;
5778 kind = PyUnicode_KIND(str);
5779 data = PyUnicode_DATA(str);
5780 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005781
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005782 pairs = 0;
5783 if (kind == PyUnicode_4BYTE_KIND)
5784 for (i = 0; i < len; i++)
5785 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5786 pairs++;
5787 /* 2 * (len + pairs + (byteorder == 0)) */
5788 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005791 bytesize = nsize * 2;
5792 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005794 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 if (v == NULL)
5796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005798 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005802 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005803
5804 if (byteorder == -1) {
5805 /* force LE */
5806 ihi = 1;
5807 ilo = 0;
5808 }
5809 else if (byteorder == 1) {
5810 /* force BE */
5811 ihi = 0;
5812 ilo = 1;
5813 }
5814
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005815 for (i = 0; i < len; i++) {
5816 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5817 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005818 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005819 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5820 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 }
Tim Peters772747b2001-08-09 22:21:55 +00005822 STORECHAR(ch);
5823 if (ch2)
5824 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005825 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005826
5827 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005828 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005829#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830}
5831
Alexander Belopolsky40018472011-02-26 01:02:56 +00005832PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005833PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5834 Py_ssize_t size,
5835 const char *errors,
5836 int byteorder)
5837{
5838 PyObject *result;
5839 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5840 if (tmp == NULL)
5841 return NULL;
5842 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5843 Py_DECREF(tmp);
5844 return result;
5845}
5846
5847PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005848PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005850 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851}
5852
5853/* --- Unicode Escape Codec ----------------------------------------------- */
5854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005855/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5856 if all the escapes in the string make it still a valid ASCII string.
5857 Returns -1 if any escapes were found which cause the string to
5858 pop out of ASCII range. Otherwise returns the length of the
5859 required buffer to hold the string.
5860 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005861static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005862length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5863{
5864 const unsigned char *p = (const unsigned char *)s;
5865 const unsigned char *end = p + size;
5866 Py_ssize_t length = 0;
5867
5868 if (size < 0)
5869 return -1;
5870
5871 for (; p < end; ++p) {
5872 if (*p > 127) {
5873 /* Non-ASCII */
5874 return -1;
5875 }
5876 else if (*p != '\\') {
5877 /* Normal character */
5878 ++length;
5879 }
5880 else {
5881 /* Backslash-escape, check next char */
5882 ++p;
5883 /* Escape sequence reaches till end of string or
5884 non-ASCII follow-up. */
5885 if (p >= end || *p > 127)
5886 return -1;
5887 switch (*p) {
5888 case '\n':
5889 /* backslash + \n result in zero characters */
5890 break;
5891 case '\\': case '\'': case '\"':
5892 case 'b': case 'f': case 't':
5893 case 'n': case 'r': case 'v': case 'a':
5894 ++length;
5895 break;
5896 case '0': case '1': case '2': case '3':
5897 case '4': case '5': case '6': case '7':
5898 case 'x': case 'u': case 'U': case 'N':
5899 /* these do not guarantee ASCII characters */
5900 return -1;
5901 default:
5902 /* count the backslash + the other character */
5903 length += 2;
5904 }
5905 }
5906 }
5907 return length;
5908}
5909
Fredrik Lundh06d12682001-01-24 07:59:11 +00005910static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005911
Alexander Belopolsky40018472011-02-26 01:02:56 +00005912PyObject *
5913PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005914 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005915 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005917 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005918 Py_ssize_t startinpos;
5919 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005920 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005921 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005923 char* message;
5924 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 PyObject *errorHandler = NULL;
5926 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005928 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005929
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005930 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005931
5932 /* After length_of_escaped_ascii_string() there are two alternatives,
5933 either the string is pure ASCII with named escapes like \n, etc.
5934 and we determined it's exact size (common case)
5935 or it contains \x, \u, ... escape sequences. then we create a
5936 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005937 if (len >= 0) {
5938 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005939 if (!v)
5940 goto onError;
5941 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005942 }
5943 else {
5944 /* Escaped strings will always be longer than the resulting
5945 Unicode string, so we start with size here and then reduce the
5946 length after conversion to the true value.
5947 (but if the error callback returns a long replacement string
5948 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005949 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005950 if (!v)
5951 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005953 }
5954
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005956 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005957 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005959
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 while (s < end) {
5961 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005962 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005963 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005965 /* The only case in which i == ascii_length is a backslash
5966 followed by a newline. */
5967 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005968
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969 /* Non-escape characters are interpreted as Unicode ordinals */
5970 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005971 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5972 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973 continue;
5974 }
5975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005976 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005977 /* \ - Escapes */
5978 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005979 c = *s++;
5980 if (s > end)
5981 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005982
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005983 /* The only case in which i == ascii_length is a backslash
5984 followed by a newline. */
5985 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005986
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005987 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005990#define WRITECHAR(ch) \
5991 do { \
5992 if (unicode_putchar(&v, &i, ch) < 0) \
5993 goto onError; \
5994 }while(0)
5995
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005997 case '\\': WRITECHAR('\\'); break;
5998 case '\'': WRITECHAR('\''); break;
5999 case '\"': WRITECHAR('\"'); break;
6000 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006001 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006002 case 'f': WRITECHAR('\014'); break;
6003 case 't': WRITECHAR('\t'); break;
6004 case 'n': WRITECHAR('\n'); break;
6005 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006006 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006007 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006008 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006009 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012 case '0': case '1': case '2': case '3':
6013 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006014 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006015 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006016 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006017 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00006018 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006019 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006020 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 break;
6022
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 /* hex escapes */
6024 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006026 digits = 2;
6027 message = "truncated \\xXX escape";
6028 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Benjamin Peterson29060642009-01-31 22:14:21 +00006030 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006032 digits = 4;
6033 message = "truncated \\uXXXX escape";
6034 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006037 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006038 digits = 8;
6039 message = "truncated \\UXXXXXXXX escape";
6040 hexescape:
6041 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006042 if (s+digits>end) {
6043 endinpos = size;
6044 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006045 errors, &errorHandler,
6046 "unicodeescape", "end of string in escape sequence",
6047 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006048 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006049 goto onError;
6050 goto nextByte;
6051 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006052 for (j = 0; j < digits; ++j) {
6053 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00006054 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006055 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 errors, &errorHandler,
6058 "unicodeescape", message,
6059 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006060 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006061 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006062 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006063 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00006064 }
6065 chr = (chr<<4) & ~0xF;
6066 if (c >= '0' && c <= '9')
6067 chr += c - '0';
6068 else if (c >= 'a' && c <= 'f')
6069 chr += 10 + c - 'a';
6070 else
6071 chr += 10 + c - 'A';
6072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006073 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00006074 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006075 /* _decoding_error will have already written into the
6076 target buffer. */
6077 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006078 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00006079 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01006080 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006081 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00006082 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006084 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006085 errors, &errorHandler,
6086 "unicodeescape", "illegal Unicode character",
6087 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006088 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00006089 goto onError;
6090 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006091 break;
6092
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006094 case 'N':
6095 message = "malformed \\N character escape";
6096 if (ucnhash_CAPI == NULL) {
6097 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006098 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6099 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00006100 if (ucnhash_CAPI == NULL)
6101 goto ucnhashError;
6102 }
6103 if (*s == '{') {
6104 const char *start = s+1;
6105 /* look for the closing brace */
6106 while (*s != '}' && s < end)
6107 s++;
6108 if (s > start && s < end && *s == '}') {
6109 /* found a name. look it up in the unicode database */
6110 message = "unknown Unicode character name";
6111 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006112 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03006113 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006114 goto store;
6115 }
6116 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006117 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 errors, &errorHandler,
6120 "unicodeescape", message,
6121 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006122 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00006123 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006124 break;
6125
6126 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00006127 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128 message = "\\ at end of string";
6129 s--;
6130 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 errors, &errorHandler,
6133 "unicodeescape", message,
6134 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006135 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00006136 goto onError;
6137 }
6138 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006139 WRITECHAR('\\');
6140 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00006141 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006142 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006147#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006148
Victor Stinner16e6a802011-12-12 13:24:15 +01006149 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006150 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006151 Py_XDECREF(errorHandler);
6152 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006153 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00006154
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00006156 PyErr_SetString(
6157 PyExc_UnicodeError,
6158 "\\N escapes not supported (can't load unicodedata module)"
6159 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00006160 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006161 Py_XDECREF(errorHandler);
6162 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00006163 return NULL;
6164
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 Py_XDECREF(errorHandler);
6168 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 return NULL;
6170}
6171
6172/* Return a Unicode-Escape string version of the Unicode object.
6173
6174 If quotes is true, the string is enclosed in u"" or u'' quotes as
6175 appropriate.
6176
6177*/
6178
Alexander Belopolsky40018472011-02-26 01:02:56 +00006179PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006183 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006185 int kind;
6186 void *data;
6187 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188
Thomas Wouters89f507f2006-12-13 04:49:30 +00006189 /* Initial allocation is based on the longest-possible unichr
6190 escape.
6191
6192 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
6193 unichr, so in this case it's the longest unichr escape. In
6194 narrow (UTF-16) builds this is five chars per source unichr
6195 since there are two unichrs in the surrogate pair, so in narrow
6196 (UTF-16) builds it's not the longest unichr escape.
6197
6198 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
6199 so in the narrow (UTF-16) build case it's the longest unichr
6200 escape.
6201 */
6202
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006203 if (!PyUnicode_Check(unicode)) {
6204 PyErr_BadArgument();
6205 return NULL;
6206 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006207 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006208 return NULL;
6209 len = PyUnicode_GET_LENGTH(unicode);
6210 kind = PyUnicode_KIND(unicode);
6211 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06006212 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006213 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
6214 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
6215 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
6216 }
6217
6218 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006219 return PyBytes_FromStringAndSize(NULL, 0);
6220
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006223
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006224 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00006227 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 if (repr == NULL)
6229 return NULL;
6230
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006231 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006233 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006234 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006235
Walter Dörwald79e913e2007-05-12 11:08:06 +00006236 /* Escape backslashes */
6237 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 *p++ = '\\';
6239 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006240 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006241 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006242
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006243 /* Map 21-bit characters to '\U00xxxxxx' */
6244 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006245 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006246 *p++ = '\\';
6247 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006248 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6249 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6250 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6251 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6252 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6253 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6254 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6255 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006257 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006258
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006260 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261 *p++ = '\\';
6262 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006263 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6264 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6265 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6266 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006268
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006269 /* Map special whitespace to '\t', \n', '\r' */
6270 else if (ch == '\t') {
6271 *p++ = '\\';
6272 *p++ = 't';
6273 }
6274 else if (ch == '\n') {
6275 *p++ = '\\';
6276 *p++ = 'n';
6277 }
6278 else if (ch == '\r') {
6279 *p++ = '\\';
6280 *p++ = 'r';
6281 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006282
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006283 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006284 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006286 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006287 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6288 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006289 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006290
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 /* Copy everything else as-is */
6292 else
6293 *p++ = (char) ch;
6294 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006296 assert(p - PyBytes_AS_STRING(repr) > 0);
6297 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6298 return NULL;
6299 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300}
6301
Alexander Belopolsky40018472011-02-26 01:02:56 +00006302PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006303PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6304 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306 PyObject *result;
6307 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6308 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310 result = PyUnicode_AsUnicodeEscapeString(tmp);
6311 Py_DECREF(tmp);
6312 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313}
6314
6315/* --- Raw Unicode Escape Codec ------------------------------------------- */
6316
Alexander Belopolsky40018472011-02-26 01:02:56 +00006317PyObject *
6318PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006319 Py_ssize_t size,
6320 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006323 Py_ssize_t startinpos;
6324 Py_ssize_t endinpos;
6325 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006326 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 const char *end;
6328 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006329 PyObject *errorHandler = NULL;
6330 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006331
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 /* Escaped strings will always be longer than the resulting
6333 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 length after conversion to the true value. (But decoding error
6335 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006336 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006337 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006339 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006340 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006341 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006342 end = s + size;
6343 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 unsigned char c;
6345 Py_UCS4 x;
6346 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006347 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 /* Non-escape characters are interpreted as Unicode ordinals */
6350 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006351 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6352 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006354 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 startinpos = s-starts;
6356
6357 /* \u-escapes are only interpreted iff the number of leading
6358 backslashes if odd */
6359 bs = s;
6360 for (;s < end;) {
6361 if (*s != '\\')
6362 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006363 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6364 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 }
6366 if (((s - bs) & 1) == 0 ||
6367 s >= end ||
6368 (*s != 'u' && *s != 'U')) {
6369 continue;
6370 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006371 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 count = *s=='u' ? 4 : 8;
6373 s++;
6374
6375 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 for (x = 0, i = 0; i < count; ++i, ++s) {
6377 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006378 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006379 endinpos = s-starts;
6380 if (unicode_decode_call_errorhandler(
6381 errors, &errorHandler,
6382 "rawunicodeescape", "truncated \\uXXXX",
6383 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006384 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 goto onError;
6386 goto nextByte;
6387 }
6388 x = (x<<4) & ~0xF;
6389 if (c >= '0' && c <= '9')
6390 x += c - '0';
6391 else if (c >= 'a' && c <= 'f')
6392 x += 10 + c - 'a';
6393 else
6394 x += 10 + c - 'A';
6395 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006396 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006397 if (unicode_putchar(&v, &outpos, x) < 0)
6398 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006399 } else {
6400 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006401 if (unicode_decode_call_errorhandler(
6402 errors, &errorHandler,
6403 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006405 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006407 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006408 nextByte:
6409 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006411 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006412 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006413 Py_XDECREF(errorHandler);
6414 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006415 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006416
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006419 Py_XDECREF(errorHandler);
6420 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 return NULL;
6422}
6423
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006426PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006428 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 char *p;
6430 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006431 Py_ssize_t expandsize, pos;
6432 int kind;
6433 void *data;
6434 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006436 if (!PyUnicode_Check(unicode)) {
6437 PyErr_BadArgument();
6438 return NULL;
6439 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006440 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006441 return NULL;
6442 kind = PyUnicode_KIND(unicode);
6443 data = PyUnicode_DATA(unicode);
6444 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006445 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6446 bytes, and 1 byte characters 4. */
6447 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006448
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006449 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006451
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006452 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 if (repr == NULL)
6454 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006455 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006456 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006457
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006458 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006459 for (pos = 0; pos < len; pos++) {
6460 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 /* Map 32-bit characters to '\Uxxxxxxxx' */
6462 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006463 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006464 *p++ = '\\';
6465 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006466 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6467 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6468 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6469 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6470 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6471 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6472 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6473 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006474 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006476 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 *p++ = '\\';
6478 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006479 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6480 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6481 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6482 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 /* Copy everything else as-is */
6485 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486 *p++ = (char) ch;
6487 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006488
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006489 assert(p > q);
6490 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006491 return NULL;
6492 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493}
6494
Alexander Belopolsky40018472011-02-26 01:02:56 +00006495PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006496PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6497 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006499 PyObject *result;
6500 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6501 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006502 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006503 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6504 Py_DECREF(tmp);
6505 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506}
6507
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006508/* --- Unicode Internal Codec ------------------------------------------- */
6509
Alexander Belopolsky40018472011-02-26 01:02:56 +00006510PyObject *
6511_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006512 Py_ssize_t size,
6513 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006514{
6515 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006516 Py_ssize_t startinpos;
6517 Py_ssize_t endinpos;
6518 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006519 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006520 const char *end;
6521 const char *reason;
6522 PyObject *errorHandler = NULL;
6523 PyObject *exc = NULL;
6524
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006525 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006526 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006527 1))
6528 return NULL;
6529
Thomas Wouters89f507f2006-12-13 04:49:30 +00006530 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006531 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006532 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006534 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006535 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006536 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006537 end = s + size;
6538
6539 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006540 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006541 Py_UCS4 ch;
6542 /* We copy the raw representation one byte at a time because the
6543 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006544 ((char *) &uch)[0] = s[0];
6545 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006546#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006547 ((char *) &uch)[2] = s[2];
6548 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006549#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006550 ch = uch;
6551
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006552 /* We have to sanity check the raw data, otherwise doom looms for
6553 some malformed UCS-4 data. */
6554 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006555#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006556 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006557#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006558 end-s < Py_UNICODE_SIZE
6559 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006561 startinpos = s - starts;
6562 if (end-s < Py_UNICODE_SIZE) {
6563 endinpos = end-starts;
6564 reason = "truncated input";
6565 }
6566 else {
6567 endinpos = s - starts + Py_UNICODE_SIZE;
6568 reason = "illegal code point (> 0x10FFFF)";
6569 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006570 if (unicode_decode_call_errorhandler(
6571 errors, &errorHandler,
6572 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006573 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006574 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006575 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006576 continue;
6577 }
6578
6579 s += Py_UNICODE_SIZE;
6580#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006581 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006582 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006583 Py_UNICODE uch2;
6584 ((char *) &uch2)[0] = s[0];
6585 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006586 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006587 {
Victor Stinner551ac952011-11-29 22:58:13 +01006588 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006589 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006590 }
6591 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006592#endif
6593
6594 if (unicode_putchar(&v, &outpos, ch) < 0)
6595 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006596 }
6597
Victor Stinner16e6a802011-12-12 13:24:15 +01006598 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006599 goto onError;
6600 Py_XDECREF(errorHandler);
6601 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006602 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006603
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006605 Py_XDECREF(v);
6606 Py_XDECREF(errorHandler);
6607 Py_XDECREF(exc);
6608 return NULL;
6609}
6610
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611/* --- Latin-1 Codec ------------------------------------------------------ */
6612
Alexander Belopolsky40018472011-02-26 01:02:56 +00006613PyObject *
6614PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006615 Py_ssize_t size,
6616 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006619 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620}
6621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623static void
6624make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006625 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006626 PyObject *unicode,
6627 Py_ssize_t startpos, Py_ssize_t endpos,
6628 const char *reason)
6629{
6630 if (*exceptionObject == NULL) {
6631 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006632 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006633 encoding, unicode, startpos, endpos, reason);
6634 }
6635 else {
6636 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6637 goto onError;
6638 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6639 goto onError;
6640 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6641 goto onError;
6642 return;
6643 onError:
6644 Py_DECREF(*exceptionObject);
6645 *exceptionObject = NULL;
6646 }
6647}
6648
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006649/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650static void
6651raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006652 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006653 PyObject *unicode,
6654 Py_ssize_t startpos, Py_ssize_t endpos,
6655 const char *reason)
6656{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006657 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006658 encoding, unicode, startpos, endpos, reason);
6659 if (*exceptionObject != NULL)
6660 PyCodec_StrictErrors(*exceptionObject);
6661}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662
6663/* error handling callback helper:
6664 build arguments, call the callback and check the arguments,
6665 put the result into newpos and return the replacement string, which
6666 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006667static PyObject *
6668unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006669 PyObject **errorHandler,
6670 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006671 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006672 Py_ssize_t startpos, Py_ssize_t endpos,
6673 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006674{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006675 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006677 PyObject *restuple;
6678 PyObject *resunicode;
6679
6680 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006681 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006684 }
6685
Benjamin Petersonbac79492012-01-14 13:34:47 -05006686 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006687 return NULL;
6688 len = PyUnicode_GET_LENGTH(unicode);
6689
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006690 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006691 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694
6695 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006700 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006701 Py_DECREF(restuple);
6702 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006703 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006704 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006705 &resunicode, newpos)) {
6706 Py_DECREF(restuple);
6707 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006709 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6710 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6711 Py_DECREF(restuple);
6712 return NULL;
6713 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006715 *newpos = len + *newpos;
6716 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006717 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6718 Py_DECREF(restuple);
6719 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 Py_INCREF(resunicode);
6722 Py_DECREF(restuple);
6723 return resunicode;
6724}
6725
Alexander Belopolsky40018472011-02-26 01:02:56 +00006726static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006727unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006728 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006729 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006731 /* input state */
6732 Py_ssize_t pos=0, size;
6733 int kind;
6734 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735 /* output object */
6736 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 /* pointer into the output */
6738 char *str;
6739 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006740 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006741 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6742 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006743 PyObject *errorHandler = NULL;
6744 PyObject *exc = NULL;
6745 /* the following variable is used for caching string comparisons
6746 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6747 int known_errorHandler = -1;
6748
Benjamin Petersonbac79492012-01-14 13:34:47 -05006749 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006750 return NULL;
6751 size = PyUnicode_GET_LENGTH(unicode);
6752 kind = PyUnicode_KIND(unicode);
6753 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 /* allocate enough for a simple encoding without
6755 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006756 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006757 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006758 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006759 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006760 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006761 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 ressize = size;
6763
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 while (pos < size) {
6765 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766
Benjamin Peterson29060642009-01-31 22:14:21 +00006767 /* can we encode this? */
6768 if (c<limit) {
6769 /* no overflow check, because we know that the space is enough */
6770 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006771 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006772 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006774 Py_ssize_t requiredsize;
6775 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006776 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006777 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006778 Py_ssize_t collstart = pos;
6779 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006780 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006781 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006782 ++collend;
6783 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6784 if (known_errorHandler==-1) {
6785 if ((errors==NULL) || (!strcmp(errors, "strict")))
6786 known_errorHandler = 1;
6787 else if (!strcmp(errors, "replace"))
6788 known_errorHandler = 2;
6789 else if (!strcmp(errors, "ignore"))
6790 known_errorHandler = 3;
6791 else if (!strcmp(errors, "xmlcharrefreplace"))
6792 known_errorHandler = 4;
6793 else
6794 known_errorHandler = 0;
6795 }
6796 switch (known_errorHandler) {
6797 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006798 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 goto onError;
6800 case 2: /* replace */
6801 while (collstart++<collend)
6802 *str++ = '?'; /* fall through */
6803 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 break;
6806 case 4: /* xmlcharrefreplace */
6807 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006808 /* determine replacement size */
6809 for (i = collstart, repsize = 0; i < collend; ++i) {
6810 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6811 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006813 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006815 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006817 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006819 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006821 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006823 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006824 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006826 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006828 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 if (requiredsize > ressize) {
6830 if (requiredsize<2*ressize)
6831 requiredsize = 2*ressize;
6832 if (_PyBytes_Resize(&res, requiredsize))
6833 goto onError;
6834 str = PyBytes_AS_STRING(res) + respos;
6835 ressize = requiredsize;
6836 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006837 /* generate replacement */
6838 for (i = collstart; i < collend; ++i) {
6839 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006840 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006841 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 break;
6843 default:
6844 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006845 encoding, reason, unicode, &exc,
6846 collstart, collend, &newpos);
6847 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006848 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006849 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006850 if (PyBytes_Check(repunicode)) {
6851 /* Directly copy bytes result to output. */
6852 repsize = PyBytes_Size(repunicode);
6853 if (repsize > 1) {
6854 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006855 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006856 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6857 Py_DECREF(repunicode);
6858 goto onError;
6859 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006860 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006861 ressize += repsize-1;
6862 }
6863 memcpy(str, PyBytes_AsString(repunicode), repsize);
6864 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006865 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006866 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006867 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006868 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 /* need more space? (at least enough for what we
6870 have+the replacement+the rest of the string, so
6871 we won't have to check space for encodable characters) */
6872 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006873 repsize = PyUnicode_GET_LENGTH(repunicode);
6874 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 if (requiredsize > ressize) {
6876 if (requiredsize<2*ressize)
6877 requiredsize = 2*ressize;
6878 if (_PyBytes_Resize(&res, requiredsize)) {
6879 Py_DECREF(repunicode);
6880 goto onError;
6881 }
6882 str = PyBytes_AS_STRING(res) + respos;
6883 ressize = requiredsize;
6884 }
6885 /* check if there is anything unencodable in the replacement
6886 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006887 for (i = 0; repsize-->0; ++i, ++str) {
6888 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006890 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006891 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 Py_DECREF(repunicode);
6893 goto onError;
6894 }
6895 *str = (char)c;
6896 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006897 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006898 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006900 }
6901 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006902 /* Resize if we allocated to much */
6903 size = str - PyBytes_AS_STRING(res);
6904 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006905 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006906 if (_PyBytes_Resize(&res, size) < 0)
6907 goto onError;
6908 }
6909
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006910 Py_XDECREF(errorHandler);
6911 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006912 return res;
6913
6914 onError:
6915 Py_XDECREF(res);
6916 Py_XDECREF(errorHandler);
6917 Py_XDECREF(exc);
6918 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919}
6920
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006921/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006922PyObject *
6923PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006924 Py_ssize_t size,
6925 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006926{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006927 PyObject *result;
6928 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6929 if (unicode == NULL)
6930 return NULL;
6931 result = unicode_encode_ucs1(unicode, errors, 256);
6932 Py_DECREF(unicode);
6933 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934}
6935
Alexander Belopolsky40018472011-02-26 01:02:56 +00006936PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006937_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006938{
6939 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006940 PyErr_BadArgument();
6941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006943 if (PyUnicode_READY(unicode) == -1)
6944 return NULL;
6945 /* Fast path: if it is a one-byte string, construct
6946 bytes object directly. */
6947 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6948 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6949 PyUnicode_GET_LENGTH(unicode));
6950 /* Non-Latin-1 characters present. Defer to above function to
6951 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006952 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006953}
6954
6955PyObject*
6956PyUnicode_AsLatin1String(PyObject *unicode)
6957{
6958 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959}
6960
6961/* --- 7-bit ASCII Codec -------------------------------------------------- */
6962
Alexander Belopolsky40018472011-02-26 01:02:56 +00006963PyObject *
6964PyUnicode_DecodeASCII(const char *s,
6965 Py_ssize_t size,
6966 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006967{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006968 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006969 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006970 int kind;
6971 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006972 Py_ssize_t startinpos;
6973 Py_ssize_t endinpos;
6974 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006975 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006976 int has_error;
6977 const unsigned char *p = (const unsigned char *)s;
6978 const unsigned char *end = p + size;
6979 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006980 PyObject *errorHandler = NULL;
6981 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006982
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006983 if (size == 0) {
6984 Py_INCREF(unicode_empty);
6985 return unicode_empty;
6986 }
6987
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006989 if (size == 1 && (unsigned char)s[0] < 128)
6990 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006991
Victor Stinner702c7342011-10-05 13:50:52 +02006992 has_error = 0;
6993 while (p < end && !has_error) {
6994 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6995 an explanation. */
6996 if (!((size_t) p & LONG_PTR_MASK)) {
6997 /* Help register allocation */
6998 register const unsigned char *_p = p;
6999 while (_p < aligned_end) {
7000 unsigned long value = *(unsigned long *) _p;
7001 if (value & ASCII_CHAR_MASK) {
7002 has_error = 1;
7003 break;
7004 }
7005 _p += SIZEOF_LONG;
7006 }
7007 if (_p == end)
7008 break;
7009 if (has_error)
7010 break;
7011 p = _p;
7012 }
7013 if (*p & 0x80) {
7014 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007015 break;
Victor Stinner702c7342011-10-05 13:50:52 +02007016 }
7017 else {
7018 ++p;
7019 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007020 }
Victor Stinner702c7342011-10-05 13:50:52 +02007021 if (!has_error)
7022 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00007023
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007024 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007025 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007027 kind = PyUnicode_KIND(v);
7028 data = PyUnicode_DATA(v);
7029 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007030 e = s + size;
7031 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 register unsigned char c = (unsigned char)*s;
7033 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007034 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 ++s;
7036 }
7037 else {
7038 startinpos = s-starts;
7039 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 if (unicode_decode_call_errorhandler(
7041 errors, &errorHandler,
7042 "ascii", "ordinal not in range(128)",
7043 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007044 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007046 kind = PyUnicode_KIND(v);
7047 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007049 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007050 if (unicode_resize(&v, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007051 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007052 Py_XDECREF(errorHandler);
7053 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007054 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01007055 return v;
Tim Petersced69f82003-09-16 20:30:58 +00007056
Benjamin Peterson29060642009-01-31 22:14:21 +00007057 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 Py_XDECREF(errorHandler);
7060 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007061 return NULL;
7062}
7063
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007064/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007065PyObject *
7066PyUnicode_EncodeASCII(const Py_UNICODE *p,
7067 Py_ssize_t size,
7068 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007070 PyObject *result;
7071 PyObject *unicode = PyUnicode_FromUnicode(p, size);
7072 if (unicode == NULL)
7073 return NULL;
7074 result = unicode_encode_ucs1(unicode, errors, 128);
7075 Py_DECREF(unicode);
7076 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007077}
7078
Alexander Belopolsky40018472011-02-26 01:02:56 +00007079PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007080_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007081{
7082 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 PyErr_BadArgument();
7084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007085 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007086 if (PyUnicode_READY(unicode) == -1)
7087 return NULL;
7088 /* Fast path: if it is an ASCII-only string, construct bytes object
7089 directly. Else defer to above function to raise the exception. */
7090 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
7091 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7092 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007093 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007094}
7095
7096PyObject *
7097PyUnicode_AsASCIIString(PyObject *unicode)
7098{
7099 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007100}
7101
Victor Stinner99b95382011-07-04 14:23:54 +02007102#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007103
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007104/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007105
/* NEED_RETRY is defined when `int` is narrower than `size_t`.  The code-page
   codecs below test it to decide whether an input has to be processed in
   several chunks whose length fits in an `int`. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide
   WC_ERR_INVALID_CHARS. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7113
7114static char*
7115code_page_name(UINT code_page, PyObject **obj)
7116{
7117 *obj = NULL;
7118 if (code_page == CP_ACP)
7119 return "mbcs";
7120 if (code_page == CP_UTF7)
7121 return "CP_UTF7";
7122 if (code_page == CP_UTF8)
7123 return "CP_UTF8";
7124
7125 *obj = PyBytes_FromFormat("cp%u", code_page);
7126 if (*obj == NULL)
7127 return NULL;
7128 return PyBytes_AS_STRING(*obj);
7129}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130
Alexander Belopolsky40018472011-02-26 01:02:56 +00007131static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007132is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007133{
7134 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007136
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 if (!IsDBCSLeadByteEx(code_page, *curr))
7138 return 0;
7139
7140 prev = CharPrevExA(code_page, s, curr, 0);
7141 if (prev == curr)
7142 return 1;
7143 /* FIXME: This code is limited to "true" double-byte encodings,
7144 as it assumes an incomplete character consists of a single
7145 byte. */
7146 if (curr - prev == 2)
7147 return 1;
7148 if (!IsDBCSLeadByteEx(code_page, *prev))
7149 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150 return 0;
7151}
7152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153static DWORD
7154decode_code_page_flags(UINT code_page)
7155{
7156 if (code_page == CP_UTF7) {
7157 /* The CP_UTF7 decoder only supports flags=0 */
7158 return 0;
7159 }
7160 else
7161 return MB_ERR_INVALID_CHARS;
7162}
7163
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 * Decode a byte string from a Windows code page into unicode object in strict
7166 * mode.
7167 *
7168 * Returns consumed size if succeed, returns -2 on decode error, or raise a
7169 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007170 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007171static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007172decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007173 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 const char *in,
7175 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176{
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01007178 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007180
7181 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 assert(insize > 0);
7183 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
7184 if (outsize <= 0)
7185 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186
7187 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007188 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01007189 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00007191 if (*v == NULL)
7192 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194 }
7195 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01007198 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007199 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201 }
7202
7203 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7205 if (outsize <= 0)
7206 goto error;
7207 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209error:
7210 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7211 return -2;
7212 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007213 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214}
7215
Victor Stinner3a50e702011-10-18 21:21:00 +02007216/*
7217 * Decode a byte string from a code page into unicode object with an error
7218 * handler.
7219 *
7220 * Returns consumed size if succeed, or raise a WindowsError or
7221 * UnicodeDecodeError exception and returns -1 on error.
7222 */
7223static int
7224decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007225 PyObject **v,
7226 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 const char *errors)
7228{
7229 const char *startin = in;
7230 const char *endin = in + size;
7231 const DWORD flags = decode_code_page_flags(code_page);
7232 /* Ideally, we should get reason from FormatMessage. This is the Windows
7233 2000 English version of the message. */
7234 const char *reason = "No mapping for the Unicode character exists "
7235 "in the target code page.";
7236 /* each step cannot decode more than 1 character, but a character can be
7237 represented as a surrogate pair */
7238 wchar_t buffer[2], *startout, *out;
7239 int insize, outsize;
7240 PyObject *errorHandler = NULL;
7241 PyObject *exc = NULL;
7242 PyObject *encoding_obj = NULL;
7243 char *encoding;
7244 DWORD err;
7245 int ret = -1;
7246
7247 assert(size > 0);
7248
7249 encoding = code_page_name(code_page, &encoding_obj);
7250 if (encoding == NULL)
7251 return -1;
7252
7253 if (errors == NULL || strcmp(errors, "strict") == 0) {
7254 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7255 UnicodeDecodeError. */
7256 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7257 if (exc != NULL) {
7258 PyCodec_StrictErrors(exc);
7259 Py_CLEAR(exc);
7260 }
7261 goto error;
7262 }
7263
7264 if (*v == NULL) {
7265 /* Create unicode object */
7266 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7267 PyErr_NoMemory();
7268 goto error;
7269 }
Victor Stinnerab595942011-12-17 04:59:06 +01007270 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01007271 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 if (*v == NULL)
7273 goto error;
7274 startout = PyUnicode_AS_UNICODE(*v);
7275 }
7276 else {
7277 /* Extend unicode object */
7278 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7279 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7280 PyErr_NoMemory();
7281 goto error;
7282 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007283 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 goto error;
7285 startout = PyUnicode_AS_UNICODE(*v) + n;
7286 }
7287
7288 /* Decode the byte string character per character */
7289 out = startout;
7290 while (in < endin)
7291 {
7292 /* Decode a character */
7293 insize = 1;
7294 do
7295 {
7296 outsize = MultiByteToWideChar(code_page, flags,
7297 in, insize,
7298 buffer, Py_ARRAY_LENGTH(buffer));
7299 if (outsize > 0)
7300 break;
7301 err = GetLastError();
7302 if (err != ERROR_NO_UNICODE_TRANSLATION
7303 && err != ERROR_INSUFFICIENT_BUFFER)
7304 {
7305 PyErr_SetFromWindowsErr(0);
7306 goto error;
7307 }
7308 insize++;
7309 }
7310 /* 4=maximum length of a UTF-8 sequence */
7311 while (insize <= 4 && (in + insize) <= endin);
7312
7313 if (outsize <= 0) {
7314 Py_ssize_t startinpos, endinpos, outpos;
7315
7316 startinpos = in - startin;
7317 endinpos = startinpos + 1;
7318 outpos = out - PyUnicode_AS_UNICODE(*v);
7319 if (unicode_decode_call_errorhandler(
7320 errors, &errorHandler,
7321 encoding, reason,
7322 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007323 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 {
7325 goto error;
7326 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007327 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 }
7329 else {
7330 in += insize;
7331 memcpy(out, buffer, outsize * sizeof(wchar_t));
7332 out += outsize;
7333 }
7334 }
7335
7336 /* write a NUL character at the end */
7337 *out = 0;
7338
7339 /* Extend unicode object */
7340 outsize = out - startout;
7341 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007342 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007344 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007345
7346error:
7347 Py_XDECREF(encoding_obj);
7348 Py_XDECREF(errorHandler);
7349 Py_XDECREF(exc);
7350 return ret;
7351}
7352
Victor Stinner3a50e702011-10-18 21:21:00 +02007353static PyObject *
7354decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007355 const char *s, Py_ssize_t size,
7356 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357{
Victor Stinner76a31a62011-11-04 00:05:13 +01007358 PyObject *v = NULL;
7359 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360
Victor Stinner3a50e702011-10-18 21:21:00 +02007361 if (code_page < 0) {
7362 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7363 return NULL;
7364 }
7365
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007366 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007367 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368
Victor Stinner76a31a62011-11-04 00:05:13 +01007369 do
7370 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 if (size > INT_MAX) {
7373 chunk_size = INT_MAX;
7374 final = 0;
7375 done = 0;
7376 }
7377 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007378#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007379 {
7380 chunk_size = (int)size;
7381 final = (consumed == NULL);
7382 done = 1;
7383 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007384
Victor Stinner76a31a62011-11-04 00:05:13 +01007385 /* Skip trailing lead-byte unless 'final' is set */
7386 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7387 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007388
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 if (chunk_size == 0 && done) {
7390 if (v != NULL)
7391 break;
7392 Py_INCREF(unicode_empty);
7393 return unicode_empty;
7394 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395
Victor Stinner76a31a62011-11-04 00:05:13 +01007396
7397 converted = decode_code_page_strict(code_page, &v,
7398 s, chunk_size);
7399 if (converted == -2)
7400 converted = decode_code_page_errors(code_page, &v,
7401 s, chunk_size,
7402 errors);
7403 assert(converted != 0);
7404
7405 if (converted < 0) {
7406 Py_XDECREF(v);
7407 return NULL;
7408 }
7409
7410 if (consumed)
7411 *consumed += converted;
7412
7413 s += converted;
7414 size -= converted;
7415 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007416
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007417 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007418}
7419
Alexander Belopolsky40018472011-02-26 01:02:56 +00007420PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007421PyUnicode_DecodeCodePageStateful(int code_page,
7422 const char *s,
7423 Py_ssize_t size,
7424 const char *errors,
7425 Py_ssize_t *consumed)
7426{
7427 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7428}
7429
7430PyObject *
7431PyUnicode_DecodeMBCSStateful(const char *s,
7432 Py_ssize_t size,
7433 const char *errors,
7434 Py_ssize_t *consumed)
7435{
7436 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7437}
7438
7439PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007440PyUnicode_DecodeMBCS(const char *s,
7441 Py_ssize_t size,
7442 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007443{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7445}
7446
Victor Stinner3a50e702011-10-18 21:21:00 +02007447static DWORD
7448encode_code_page_flags(UINT code_page, const char *errors)
7449{
7450 if (code_page == CP_UTF8) {
7451 if (winver.dwMajorVersion >= 6)
7452 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7453 and later */
7454 return WC_ERR_INVALID_CHARS;
7455 else
7456 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7457 return 0;
7458 }
7459 else if (code_page == CP_UTF7) {
7460 /* CP_UTF7 only supports flags=0 */
7461 return 0;
7462 }
7463 else {
7464 if (errors != NULL && strcmp(errors, "replace") == 0)
7465 return 0;
7466 else
7467 return WC_NO_BEST_FIT_CHARS;
7468 }
7469}
7470
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007472 * Encode a Unicode string to a Windows code page into a byte string in strict
7473 * mode.
7474 *
7475 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7476 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007477 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007478static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007479encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007480 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007481 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482{
Victor Stinner554f3f02010-06-16 23:33:54 +00007483 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 BOOL *pusedDefaultChar = &usedDefaultChar;
7485 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007486 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007487 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 const DWORD flags = encode_code_page_flags(code_page, NULL);
7490 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007491 /* Create a substring so that we can get the UTF-16 representation
7492 of just the slice under consideration. */
7493 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007494
Martin v. Löwis3d325192011-11-04 18:23:06 +01007495 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007496
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007498 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007500 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007501
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 substring = PyUnicode_Substring(unicode, offset, offset+len);
7503 if (substring == NULL)
7504 return -1;
7505 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7506 if (p == NULL) {
7507 Py_DECREF(substring);
7508 return -1;
7509 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007510
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007511 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 outsize = WideCharToMultiByte(code_page, flags,
7513 p, size,
7514 NULL, 0,
7515 NULL, pusedDefaultChar);
7516 if (outsize <= 0)
7517 goto error;
7518 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 if (pusedDefaultChar && *pusedDefaultChar) {
7520 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007523
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007525 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007527 if (*outbytes == NULL) {
7528 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007530 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007532 }
7533 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007534 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 const Py_ssize_t n = PyBytes_Size(*outbytes);
7536 if (outsize > PY_SSIZE_T_MAX - n) {
7537 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007538 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007541 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7542 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007544 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546 }
7547
7548 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 outsize = WideCharToMultiByte(code_page, flags,
7550 p, size,
7551 out, outsize,
7552 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007553 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 if (outsize <= 0)
7555 goto error;
7556 if (pusedDefaultChar && *pusedDefaultChar)
7557 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007558 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007559
Victor Stinner3a50e702011-10-18 21:21:00 +02007560error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007561 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7563 return -2;
7564 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007565 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007566}
7567
Victor Stinner3a50e702011-10-18 21:21:00 +02007568/*
7569 * Encode a Unicode string to a Windows code page into a byte string using a
7570 * error handler.
7571 *
7572 * Returns consumed characters if succeed, or raise a WindowsError and returns
7573 * -1 on other error.
7574 */
7575static int
7576encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007577 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007578 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007579{
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007581 Py_ssize_t pos = unicode_offset;
7582 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 /* Ideally, we should get reason from FormatMessage. This is the Windows
7584 2000 English version of the message. */
7585 const char *reason = "invalid character";
7586 /* 4=maximum length of a UTF-8 sequence */
7587 char buffer[4];
7588 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7589 Py_ssize_t outsize;
7590 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 PyObject *errorHandler = NULL;
7592 PyObject *exc = NULL;
7593 PyObject *encoding_obj = NULL;
7594 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007595 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 PyObject *rep;
7597 int ret = -1;
7598
7599 assert(insize > 0);
7600
7601 encoding = code_page_name(code_page, &encoding_obj);
7602 if (encoding == NULL)
7603 return -1;
7604
7605 if (errors == NULL || strcmp(errors, "strict") == 0) {
7606 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7607 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007608 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 if (exc != NULL) {
7610 PyCodec_StrictErrors(exc);
7611 Py_DECREF(exc);
7612 }
7613 Py_XDECREF(encoding_obj);
7614 return -1;
7615 }
7616
7617 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7618 pusedDefaultChar = &usedDefaultChar;
7619 else
7620 pusedDefaultChar = NULL;
7621
7622 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7623 PyErr_NoMemory();
7624 goto error;
7625 }
7626 outsize = insize * Py_ARRAY_LENGTH(buffer);
7627
7628 if (*outbytes == NULL) {
7629 /* Create string object */
7630 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7631 if (*outbytes == NULL)
7632 goto error;
7633 out = PyBytes_AS_STRING(*outbytes);
7634 }
7635 else {
7636 /* Extend string object */
7637 Py_ssize_t n = PyBytes_Size(*outbytes);
7638 if (n > PY_SSIZE_T_MAX - outsize) {
7639 PyErr_NoMemory();
7640 goto error;
7641 }
7642 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7643 goto error;
7644 out = PyBytes_AS_STRING(*outbytes) + n;
7645 }
7646
7647 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007648 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007649 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007650 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7651 wchar_t chars[2];
7652 int charsize;
7653 if (ch < 0x10000) {
7654 chars[0] = (wchar_t)ch;
7655 charsize = 1;
7656 }
7657 else {
7658 ch -= 0x10000;
7659 chars[0] = 0xd800 + (ch >> 10);
7660 chars[1] = 0xdc00 + (ch & 0x3ff);
7661 charsize = 2;
7662 }
7663
Victor Stinner3a50e702011-10-18 21:21:00 +02007664 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 buffer, Py_ARRAY_LENGTH(buffer),
7667 NULL, pusedDefaultChar);
7668 if (outsize > 0) {
7669 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7670 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007671 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 memcpy(out, buffer, outsize);
7673 out += outsize;
7674 continue;
7675 }
7676 }
7677 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7678 PyErr_SetFromWindowsErr(0);
7679 goto error;
7680 }
7681
Victor Stinner3a50e702011-10-18 21:21:00 +02007682 rep = unicode_encode_call_errorhandler(
7683 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007684 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007685 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007686 if (rep == NULL)
7687 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007688 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007689
7690 if (PyBytes_Check(rep)) {
7691 outsize = PyBytes_GET_SIZE(rep);
7692 if (outsize != 1) {
7693 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7694 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7695 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7696 Py_DECREF(rep);
7697 goto error;
7698 }
7699 out = PyBytes_AS_STRING(*outbytes) + offset;
7700 }
7701 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7702 out += outsize;
7703 }
7704 else {
7705 Py_ssize_t i;
7706 enum PyUnicode_Kind kind;
7707 void *data;
7708
Benjamin Petersonbac79492012-01-14 13:34:47 -05007709 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007710 Py_DECREF(rep);
7711 goto error;
7712 }
7713
7714 outsize = PyUnicode_GET_LENGTH(rep);
7715 if (outsize != 1) {
7716 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7717 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7718 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7719 Py_DECREF(rep);
7720 goto error;
7721 }
7722 out = PyBytes_AS_STRING(*outbytes) + offset;
7723 }
7724 kind = PyUnicode_KIND(rep);
7725 data = PyUnicode_DATA(rep);
7726 for (i=0; i < outsize; i++) {
7727 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7728 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007729 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007730 encoding, unicode,
7731 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007732 "unable to encode error handler result to ASCII");
7733 Py_DECREF(rep);
7734 goto error;
7735 }
7736 *out = (unsigned char)ch;
7737 out++;
7738 }
7739 }
7740 Py_DECREF(rep);
7741 }
7742 /* write a NUL byte */
7743 *out = 0;
7744 outsize = out - PyBytes_AS_STRING(*outbytes);
7745 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7746 if (_PyBytes_Resize(outbytes, outsize) < 0)
7747 goto error;
7748 ret = 0;
7749
7750error:
7751 Py_XDECREF(encoding_obj);
7752 Py_XDECREF(errorHandler);
7753 Py_XDECREF(exc);
7754 return ret;
7755}
7756
Victor Stinner3a50e702011-10-18 21:21:00 +02007757static PyObject *
7758encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007759 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 const char *errors)
7761{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007762 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007765 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007766
Benjamin Petersonbac79492012-01-14 13:34:47 -05007767 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007768 return NULL;
7769 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007770
Victor Stinner3a50e702011-10-18 21:21:00 +02007771 if (code_page < 0) {
7772 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7773 return NULL;
7774 }
7775
Martin v. Löwis3d325192011-11-04 18:23:06 +01007776 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007777 return PyBytes_FromStringAndSize(NULL, 0);
7778
Victor Stinner7581cef2011-11-03 22:32:33 +01007779 offset = 0;
7780 do
7781 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007783 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 chunks. */
7785 if (len > INT_MAX/2) {
7786 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007787 done = 0;
7788 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007789 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007790#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007791 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007792 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007793 done = 1;
7794 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007795
Victor Stinner76a31a62011-11-04 00:05:13 +01007796 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007797 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007798 errors);
7799 if (ret == -2)
7800 ret = encode_code_page_errors(code_page, &outbytes,
7801 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007802 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007803 if (ret < 0) {
7804 Py_XDECREF(outbytes);
7805 return NULL;
7806 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007807
Victor Stinner7581cef2011-11-03 22:32:33 +01007808 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007809 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007810 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007811
Victor Stinner3a50e702011-10-18 21:21:00 +02007812 return outbytes;
7813}
7814
7815PyObject *
7816PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7817 Py_ssize_t size,
7818 const char *errors)
7819{
Victor Stinner7581cef2011-11-03 22:32:33 +01007820 PyObject *unicode, *res;
7821 unicode = PyUnicode_FromUnicode(p, size);
7822 if (unicode == NULL)
7823 return NULL;
7824 res = encode_code_page(CP_ACP, unicode, errors);
7825 Py_DECREF(unicode);
7826 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007827}
7828
7829PyObject *
7830PyUnicode_EncodeCodePage(int code_page,
7831 PyObject *unicode,
7832 const char *errors)
7833{
Victor Stinner7581cef2011-11-03 22:32:33 +01007834 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007835}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007836
Alexander Belopolsky40018472011-02-26 01:02:56 +00007837PyObject *
7838PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007839{
7840 if (!PyUnicode_Check(unicode)) {
7841 PyErr_BadArgument();
7842 return NULL;
7843 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007844 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007845}
7846
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007847#undef NEED_RETRY
7848
Victor Stinner99b95382011-07-04 14:23:54 +02007849#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007850
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851/* --- Character Mapping Codec -------------------------------------------- */
7852
Alexander Belopolsky40018472011-02-26 01:02:56 +00007853PyObject *
7854PyUnicode_DecodeCharmap(const char *s,
7855 Py_ssize_t size,
7856 PyObject *mapping,
7857 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007859 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007860 Py_ssize_t startinpos;
7861 Py_ssize_t endinpos;
7862 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007864 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007865 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007866 PyObject *errorHandler = NULL;
7867 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007868
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869 /* Default to Latin-1 */
7870 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007872
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007873 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007874 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007876 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007877 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007878 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007880 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007881 Py_ssize_t maplen;
7882 enum PyUnicode_Kind kind;
7883 void *data;
7884 Py_UCS4 x;
7885
Benjamin Petersonbac79492012-01-14 13:34:47 -05007886 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007887 return NULL;
7888
7889 maplen = PyUnicode_GET_LENGTH(mapping);
7890 data = PyUnicode_DATA(mapping);
7891 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 while (s < e) {
7893 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007896 x = PyUnicode_READ(kind, data, ch);
7897 else
7898 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007900 if (x == 0xfffe)
7901 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 startinpos = s-starts;
7904 endinpos = startinpos+1;
7905 if (unicode_decode_call_errorhandler(
7906 errors, &errorHandler,
7907 "charmap", "character maps to <undefined>",
7908 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007909 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 goto onError;
7911 }
7912 continue;
7913 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007914
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007915 if (unicode_putchar(&v, &outpos, x) < 0)
7916 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007917 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007918 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007919 }
7920 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 while (s < e) {
7922 unsigned char ch = *s;
7923 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007924
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7926 w = PyLong_FromLong((long)ch);
7927 if (w == NULL)
7928 goto onError;
7929 x = PyObject_GetItem(mapping, w);
7930 Py_DECREF(w);
7931 if (x == NULL) {
7932 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7933 /* No mapping found means: mapping is undefined. */
7934 PyErr_Clear();
7935 x = Py_None;
7936 Py_INCREF(x);
7937 } else
7938 goto onError;
7939 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007940
Benjamin Peterson29060642009-01-31 22:14:21 +00007941 /* Apply mapping */
7942 if (PyLong_Check(x)) {
7943 long value = PyLong_AS_LONG(x);
7944 if (value < 0 || value > 65535) {
7945 PyErr_SetString(PyExc_TypeError,
7946 "character mapping must be in range(65536)");
7947 Py_DECREF(x);
7948 goto onError;
7949 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007950 if (unicode_putchar(&v, &outpos, value) < 0)
7951 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007952 }
7953 else if (x == Py_None) {
7954 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007955 startinpos = s-starts;
7956 endinpos = startinpos+1;
7957 if (unicode_decode_call_errorhandler(
7958 errors, &errorHandler,
7959 "charmap", "character maps to <undefined>",
7960 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007961 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 Py_DECREF(x);
7963 goto onError;
7964 }
7965 Py_DECREF(x);
7966 continue;
7967 }
7968 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007969 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007970
Benjamin Petersonbac79492012-01-14 13:34:47 -05007971 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007972 goto onError;
7973 targetsize = PyUnicode_GET_LENGTH(x);
7974
7975 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007977 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007978 PyUnicode_READ_CHAR(x, 0)) < 0)
7979 goto onError;
7980 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007981 else if (targetsize > 1) {
7982 /* 1-n mapping */
7983 if (targetsize > extrachars) {
7984 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 Py_ssize_t needed = (targetsize - extrachars) + \
7986 (targetsize << 2);
7987 extrachars += needed;
7988 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007989 if (unicode_resize(&v,
7990 PyUnicode_GET_LENGTH(v) + needed) < 0)
7991 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 Py_DECREF(x);
7993 goto onError;
7994 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007996 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7997 goto onError;
7998 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7999 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 extrachars -= targetsize;
8001 }
8002 /* 1-0 mapping: skip the character */
8003 }
8004 else {
8005 /* wrong return value */
8006 PyErr_SetString(PyExc_TypeError,
8007 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 Py_DECREF(x);
8009 goto onError;
8010 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 Py_DECREF(x);
8012 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008014 }
Victor Stinner16e6a802011-12-12 13:24:15 +01008015 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01008016 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 Py_XDECREF(errorHandler);
8018 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008019 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00008020
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022 Py_XDECREF(errorHandler);
8023 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 Py_XDECREF(v);
8025 return NULL;
8026}
8027
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028/* Charmap encoding: the lookup table */
8029
/* Three-level lookup table used by the charmap encoder for code points
   up to 0xFFFF (see PyUnicode_BuildEncodingMap and encoding_map_lookup).

   The object is allocated with extra trailing space, so level23 is
   really 16*count2 + 128*count3 bytes long; the declared size of 1 is
   only a placeholder and is subtracted again in the size computations. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by (code point >> 11); 0xFF means "no level-2 block". */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and 128-byte level-3 blocks. */
    int count2, count3;
    /* count2 level-2 blocks followed by count3 level-3 blocks. */
    unsigned char level23[1];
};
8036
8037static PyObject*
8038encoding_map_size(PyObject *obj, PyObject* args)
8039{
8040 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008041 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043}
8044
8045static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 PyDoc_STR("Return the size (in bytes) of this object") },
8048 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008049};
8050
/* tp_dealloc slot of EncodingMapType: the object holds no references to
   other objects, so releasing its memory block is all that is needed.
   The block is obtained with PyObject_MALLOC in
   PyUnicode_BuildEncodingMap. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
8056
/* Type object for the three-level encoding table (struct encoding_map).
   Only tp_dealloc and tp_methods are filled in; every other slot is 0.
   tp_itemsize is 0 even though instances carry trailing data: the
   allocation size is computed by hand in PyUnicode_BuildEncodingMap. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8100
8101PyObject*
8102PyUnicode_BuildEncodingMap(PyObject* string)
8103{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104 PyObject *result;
8105 struct encoding_map *mresult;
8106 int i;
8107 int need_dict = 0;
8108 unsigned char level1[32];
8109 unsigned char level2[512];
8110 unsigned char *mlevel1, *mlevel2, *mlevel3;
8111 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 int kind;
8113 void *data;
8114 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 PyErr_BadArgument();
8118 return NULL;
8119 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008120 kind = PyUnicode_KIND(string);
8121 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 memset(level1, 0xFF, sizeof level1);
8123 memset(level2, 0xFF, sizeof level2);
8124
8125 /* If there isn't a one-to-one mapping of NULL to \0,
8126 or if there are non-BMP characters, we need to use
8127 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008129 need_dict = 1;
8130 for (i = 1; i < 256; i++) {
8131 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 ch = PyUnicode_READ(kind, data, i);
8133 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 need_dict = 1;
8135 break;
8136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008137 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008138 /* unmapped character */
8139 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008140 l1 = ch >> 11;
8141 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008142 if (level1[l1] == 0xFF)
8143 level1[l1] = count2++;
8144 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008145 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008146 }
8147
8148 if (count2 >= 0xFF || count3 >= 0xFF)
8149 need_dict = 1;
8150
8151 if (need_dict) {
8152 PyObject *result = PyDict_New();
8153 PyObject *key, *value;
8154 if (!result)
8155 return NULL;
8156 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008158 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008159 if (!key || !value)
8160 goto failed1;
8161 if (PyDict_SetItem(result, key, value) == -1)
8162 goto failed1;
8163 Py_DECREF(key);
8164 Py_DECREF(value);
8165 }
8166 return result;
8167 failed1:
8168 Py_XDECREF(key);
8169 Py_XDECREF(value);
8170 Py_DECREF(result);
8171 return NULL;
8172 }
8173
8174 /* Create a three-level trie */
8175 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8176 16*count2 + 128*count3 - 1);
8177 if (!result)
8178 return PyErr_NoMemory();
8179 PyObject_Init(result, &EncodingMapType);
8180 mresult = (struct encoding_map*)result;
8181 mresult->count2 = count2;
8182 mresult->count3 = count3;
8183 mlevel1 = mresult->level1;
8184 mlevel2 = mresult->level23;
8185 mlevel3 = mresult->level23 + 16*count2;
8186 memcpy(mlevel1, level1, 32);
8187 memset(mlevel2, 0xFF, 16*count2);
8188 memset(mlevel3, 0, 128*count3);
8189 count3 = 0;
8190 for (i = 1; i < 256; i++) {
8191 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 /* unmapped character */
8194 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 o1 = PyUnicode_READ(kind, data, i)>>11;
8196 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 i2 = 16*mlevel1[o1] + o2;
8198 if (mlevel2[i2] == 0xFF)
8199 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008200 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201 i3 = 128*mlevel2[i2] + o3;
8202 mlevel3[i3] = i;
8203 }
8204 return result;
8205}
8206
8207static int
Victor Stinner22168992011-11-20 17:09:18 +01008208encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008209{
8210 struct encoding_map *map = (struct encoding_map*)mapping;
8211 int l1 = c>>11;
8212 int l2 = (c>>7) & 0xF;
8213 int l3 = c & 0x7F;
8214 int i;
8215
Victor Stinner22168992011-11-20 17:09:18 +01008216 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 if (c == 0)
8219 return 0;
8220 /* level 1*/
8221 i = map->level1[l1];
8222 if (i == 0xFF) {
8223 return -1;
8224 }
8225 /* level 2*/
8226 i = map->level23[16*i+l2];
8227 if (i == 0xFF) {
8228 return -1;
8229 }
8230 /* level 3 */
8231 i = map->level23[16*map->count2 + 128*i + l3];
8232 if (i == 0) {
8233 return -1;
8234 }
8235 return i;
8236}
8237
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238/* Lookup the character ch in the mapping. If the character
8239 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008240 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008241static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008242charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008243{
Christian Heimes217cfd12007-12-02 14:31:20 +00008244 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 PyObject *x;
8246
8247 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 x = PyObject_GetItem(mapping, w);
8250 Py_DECREF(w);
8251 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8253 /* No mapping found means: mapping is undefined. */
8254 PyErr_Clear();
8255 x = Py_None;
8256 Py_INCREF(x);
8257 return x;
8258 } else
8259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008261 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008263 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 long value = PyLong_AS_LONG(x);
8265 if (value < 0 || value > 255) {
8266 PyErr_SetString(PyExc_TypeError,
8267 "character mapping must be in range(256)");
8268 Py_DECREF(x);
8269 return NULL;
8270 }
8271 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008273 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 /* wrong return value */
8277 PyErr_Format(PyExc_TypeError,
8278 "character mapping must return integer, bytes or None, not %.400s",
8279 x->ob_type->tp_name);
8280 Py_DECREF(x);
8281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282 }
8283}
8284
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008286charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008288 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8289 /* exponentially overallocate to minimize reallocations */
8290 if (requiredsize < 2*outsize)
8291 requiredsize = 2*outsize;
8292 if (_PyBytes_Resize(outobj, requiredsize))
8293 return -1;
8294 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008295}
8296
/* Outcome of charmapencode_output():
     enc_SUCCESS    the character was encoded and written to the output;
     enc_FAILED     the mapping has no entry for the character, nothing
                    was written;
     enc_EXCEPTION  the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008301 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 space is available. Return a new reference to the object that
8303 was put in the output buffer, or Py_None, if the mapping was undefined
8304 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008305 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008306static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008307charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008308 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310 PyObject *rep;
8311 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008312 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313
Christian Heimes90aa7642007-12-19 02:45:37 +00008314 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008315 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008317 if (res == -1)
8318 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 if (outsize<requiredsize)
8320 if (charmapencode_resize(outobj, outpos, requiredsize))
8321 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008322 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 outstart[(*outpos)++] = (char)res;
8324 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008325 }
8326
8327 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008328 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 Py_DECREF(rep);
8332 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 if (PyLong_Check(rep)) {
8335 Py_ssize_t requiredsize = *outpos+1;
8336 if (outsize<requiredsize)
8337 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8338 Py_DECREF(rep);
8339 return enc_EXCEPTION;
8340 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008341 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008343 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 else {
8345 const char *repchars = PyBytes_AS_STRING(rep);
8346 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8347 Py_ssize_t requiredsize = *outpos+repsize;
8348 if (outsize<requiredsize)
8349 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8350 Py_DECREF(rep);
8351 return enc_EXCEPTION;
8352 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008353 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 memcpy(outstart + *outpos, repchars, repsize);
8355 *outpos += repsize;
8356 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 Py_DECREF(rep);
8359 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360}
8361
8362/* handle an error in PyUnicode_EncodeCharmap
8363 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008364static int
8365charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008368 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008369 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370{
8371 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008372 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008373 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008374 enum PyUnicode_Kind kind;
8375 void *data;
8376 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008378 Py_ssize_t collstartpos = *inpos;
8379 Py_ssize_t collendpos = *inpos+1;
8380 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 char *encoding = "charmap";
8382 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008384 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008385 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386
Benjamin Petersonbac79492012-01-14 13:34:47 -05008387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008388 return -1;
8389 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008390 /* find all unencodable characters */
8391 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008392 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008393 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008394 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008395 val = encoding_map_lookup(ch, mapping);
8396 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 break;
8398 ++collendpos;
8399 continue;
8400 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008402 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8403 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 if (rep==NULL)
8405 return -1;
8406 else if (rep!=Py_None) {
8407 Py_DECREF(rep);
8408 break;
8409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
8413 /* cache callback name lookup
8414 * (if not done yet, i.e. it's the first error) */
8415 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 if ((errors==NULL) || (!strcmp(errors, "strict")))
8417 *known_errorHandler = 1;
8418 else if (!strcmp(errors, "replace"))
8419 *known_errorHandler = 2;
8420 else if (!strcmp(errors, "ignore"))
8421 *known_errorHandler = 3;
8422 else if (!strcmp(errors, "xmlcharrefreplace"))
8423 *known_errorHandler = 4;
8424 else
8425 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 }
8427 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008428 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008429 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008430 return -1;
8431 case 2: /* replace */
8432 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 x = charmapencode_output('?', mapping, res, respos);
8434 if (x==enc_EXCEPTION) {
8435 return -1;
8436 }
8437 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008438 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return -1;
8440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 }
8442 /* fall through */
8443 case 3: /* ignore */
8444 *inpos = collendpos;
8445 break;
8446 case 4: /* xmlcharrefreplace */
8447 /* generate replacement (temporarily (mis)uses p) */
8448 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 char buffer[2+29+1+1];
8450 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008451 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 for (cp = buffer; *cp; ++cp) {
8453 x = charmapencode_output(*cp, mapping, res, respos);
8454 if (x==enc_EXCEPTION)
8455 return -1;
8456 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008457 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 return -1;
8459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 }
8461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462 *inpos = collendpos;
8463 break;
8464 default:
8465 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008466 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008470 if (PyBytes_Check(repunicode)) {
8471 /* Directly copy bytes result to output. */
8472 Py_ssize_t outsize = PyBytes_Size(*res);
8473 Py_ssize_t requiredsize;
8474 repsize = PyBytes_Size(repunicode);
8475 requiredsize = *respos + repsize;
8476 if (requiredsize > outsize)
8477 /* Make room for all additional bytes. */
8478 if (charmapencode_resize(res, respos, requiredsize)) {
8479 Py_DECREF(repunicode);
8480 return -1;
8481 }
8482 memcpy(PyBytes_AsString(*res) + *respos,
8483 PyBytes_AsString(repunicode), repsize);
8484 *respos += repsize;
8485 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008486 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008487 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008488 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008490 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008491 Py_DECREF(repunicode);
8492 return -1;
8493 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008494 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008495 data = PyUnicode_DATA(repunicode);
8496 kind = PyUnicode_KIND(repunicode);
8497 for (index = 0; index < repsize; index++) {
8498 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8499 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008501 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
8503 }
8504 else if (x==enc_FAILED) {
8505 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008506 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 return -1;
8508 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 }
8510 *inpos = newpos;
8511 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008512 }
8513 return 0;
8514}
8515
Alexander Belopolsky40018472011-02-26 01:02:56 +00008516PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008517_PyUnicode_EncodeCharmap(PyObject *unicode,
8518 PyObject *mapping,
8519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008520{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521 /* output object */
8522 PyObject *res = NULL;
8523 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008524 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008525 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008527 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 PyObject *errorHandler = NULL;
8529 PyObject *exc = NULL;
8530 /* the following variable is used for caching string comparisons
8531 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8532 * 3=ignore, 4=xmlcharrefreplace */
8533 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008534
Benjamin Petersonbac79492012-01-14 13:34:47 -05008535 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 return NULL;
8537 size = PyUnicode_GET_LENGTH(unicode);
8538
Guido van Rossumd57fd912000-03-10 22:53:23 +00008539 /* Default to Latin-1 */
8540 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008541 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008543 /* allocate enough for a simple encoding without
8544 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008545 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 if (res == NULL)
8547 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008548 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008550
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008552 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 if (x==enc_EXCEPTION) /* error */
8556 goto onError;
8557 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 &exc,
8560 &known_errorHandler, &errorHandler, errors,
8561 &res, &respos)) {
8562 goto onError;
8563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 else
8566 /* done with this character => adjust input position */
8567 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008571 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008572 if (_PyBytes_Resize(&res, respos) < 0)
8573 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008574
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 Py_XDECREF(exc);
8576 Py_XDECREF(errorHandler);
8577 return res;
8578
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008580 Py_XDECREF(res);
8581 Py_XDECREF(exc);
8582 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008583 return NULL;
8584}
8585
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008586/* Deprecated */
8587PyObject *
8588PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8589 Py_ssize_t size,
8590 PyObject *mapping,
8591 const char *errors)
8592{
8593 PyObject *result;
8594 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8595 if (unicode == NULL)
8596 return NULL;
8597 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8598 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008599 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600}
8601
Alexander Belopolsky40018472011-02-26 01:02:56 +00008602PyObject *
8603PyUnicode_AsCharmapString(PyObject *unicode,
8604 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008605{
8606 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 PyErr_BadArgument();
8608 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008609 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008610 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611}
8612
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008613/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008614static void
8615make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008617 Py_ssize_t startpos, Py_ssize_t endpos,
8618 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 *exceptionObject = _PyUnicodeTranslateError_Create(
8622 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008623 }
8624 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8626 goto onError;
8627 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8628 goto onError;
8629 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8630 goto onError;
8631 return;
8632 onError:
8633 Py_DECREF(*exceptionObject);
8634 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635 }
8636}
8637
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008639static void
8640raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642 Py_ssize_t startpos, Py_ssize_t endpos,
8643 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644{
8645 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649}
8650
8651/* error handling callback helper:
8652 build arguments, call the callback and check the arguments,
8653 put the result into newpos and return the replacement string, which
8654 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655static PyObject *
8656unicode_translate_call_errorhandler(const char *errors,
8657 PyObject **errorHandler,
8658 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008659 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660 Py_ssize_t startpos, Py_ssize_t endpos,
8661 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008663 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008664
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008665 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 PyObject *restuple;
8667 PyObject *resunicode;
8668
8669 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008671 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 }
8674
8675 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679
8680 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008682 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008685 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 Py_DECREF(restuple);
8687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008688 }
8689 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 &resunicode, &i_newpos)) {
8691 Py_DECREF(restuple);
8692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008694 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008696 else
8697 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8700 Py_DECREF(restuple);
8701 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008702 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703 Py_INCREF(resunicode);
8704 Py_DECREF(restuple);
8705 return resunicode;
8706}
8707
8708/* Lookup the character ch in the mapping and put the result in result,
8709 which must be decrefed by the caller.
8710 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008711static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713{
Christian Heimes217cfd12007-12-02 14:31:20 +00008714 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 PyObject *x;
8716
8717 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 x = PyObject_GetItem(mapping, w);
8720 Py_DECREF(w);
8721 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8723 /* No mapping found means: use 1:1 mapping. */
8724 PyErr_Clear();
8725 *result = NULL;
8726 return 0;
8727 } else
8728 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 }
8730 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 *result = x;
8732 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008734 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 long value = PyLong_AS_LONG(x);
8736 long max = PyUnicode_GetMax();
8737 if (value < 0 || value > max) {
8738 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008739 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 Py_DECREF(x);
8741 return -1;
8742 }
8743 *result = x;
8744 return 0;
8745 }
8746 else if (PyUnicode_Check(x)) {
8747 *result = x;
8748 return 0;
8749 }
8750 else {
8751 /* wrong return value */
8752 PyErr_SetString(PyExc_TypeError,
8753 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008754 Py_DECREF(x);
8755 return -1;
8756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757}
8758/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 if not reallocate and adjust various state variables.
8760 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008761static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008764{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008766 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 /* exponentially overallocate to minimize reallocations */
8768 if (requiredsize < 2 * oldsize)
8769 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8771 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008774 }
8775 return 0;
8776}
8777/* lookup the character, put the result in the output string and adjust
8778 various state variables. Return a new reference to the object that
8779 was put in the output buffer in *result, or Py_None, if the mapping was
8780 undefined (in which case no character was written).
8781 The called must decref result.
8782 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008783static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008784charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8785 PyObject *mapping, Py_UCS4 **output,
8786 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008787 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008789 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8790 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008795 }
8796 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008798 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008799 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008800 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008801 }
8802 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 Py_ssize_t repsize;
8804 if (PyUnicode_READY(*res) == -1)
8805 return -1;
8806 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 if (repsize==1) {
8808 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 }
8811 else if (repsize!=0) {
8812 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 Py_ssize_t requiredsize = *opos +
8814 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 Py_ssize_t i;
8817 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 for(i = 0; i < repsize; i++)
8820 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008822 }
8823 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 return 0;
8826}
8827
Alexander Belopolsky40018472011-02-26 01:02:56 +00008828PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829_PyUnicode_TranslateCharmap(PyObject *input,
8830 PyObject *mapping,
8831 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008833 /* input object */
8834 char *idata;
8835 Py_ssize_t size, i;
8836 int kind;
8837 /* output buffer */
8838 Py_UCS4 *output = NULL;
8839 Py_ssize_t osize;
8840 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008841 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843 char *reason = "character maps to <undefined>";
8844 PyObject *errorHandler = NULL;
8845 PyObject *exc = NULL;
8846 /* the following variable is used for caching string comparisons
8847 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8848 * 3=ignore, 4=xmlcharrefreplace */
8849 int known_errorHandler = -1;
8850
Guido van Rossumd57fd912000-03-10 22:53:23 +00008851 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008852 PyErr_BadArgument();
8853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008854 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008856 if (PyUnicode_READY(input) == -1)
8857 return NULL;
8858 idata = (char*)PyUnicode_DATA(input);
8859 kind = PyUnicode_KIND(input);
8860 size = PyUnicode_GET_LENGTH(input);
8861 i = 0;
8862
8863 if (size == 0) {
8864 Py_INCREF(input);
8865 return input;
8866 }
8867
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008868 /* allocate enough for a simple 1:1 translation without
8869 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 osize = size;
8871 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8872 opos = 0;
8873 if (output == NULL) {
8874 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 /* try to encode it */
8880 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008881 if (charmaptranslate_output(input, i, mapping,
8882 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 Py_XDECREF(x);
8884 goto onError;
8885 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008886 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 else { /* untranslatable character */
8890 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8891 Py_ssize_t repsize;
8892 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008895 Py_ssize_t collstart = i;
8896 Py_ssize_t collend = i+1;
8897 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898
Benjamin Peterson29060642009-01-31 22:14:21 +00008899 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 while (collend < size) {
8901 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 goto onError;
8903 Py_XDECREF(x);
8904 if (x!=Py_None)
8905 break;
8906 ++collend;
8907 }
8908 /* cache callback name lookup
8909 * (if not done yet, i.e. it's the first error) */
8910 if (known_errorHandler==-1) {
8911 if ((errors==NULL) || (!strcmp(errors, "strict")))
8912 known_errorHandler = 1;
8913 else if (!strcmp(errors, "replace"))
8914 known_errorHandler = 2;
8915 else if (!strcmp(errors, "ignore"))
8916 known_errorHandler = 3;
8917 else if (!strcmp(errors, "xmlcharrefreplace"))
8918 known_errorHandler = 4;
8919 else
8920 known_errorHandler = 0;
8921 }
8922 switch (known_errorHandler) {
8923 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 raise_translate_exception(&exc, input, collstart,
8925 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008926 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008927 case 2: /* replace */
8928 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008929 for (coll = collstart; coll<collend; coll++)
8930 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008931 /* fall through */
8932 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 break;
8935 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 /* generate replacement (temporarily (mis)uses i) */
8937 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 char buffer[2+29+1+1];
8939 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8941 if (charmaptranslate_makespace(&output, &osize,
8942 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 goto onError;
8944 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008947 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 break;
8949 default:
8950 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 reason, input, &exc,
8952 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008953 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008955 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008956 Py_DECREF(repunicode);
8957 goto onError;
8958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 repsize = PyUnicode_GET_LENGTH(repunicode);
8961 if (charmaptranslate_makespace(&output, &osize,
8962 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 Py_DECREF(repunicode);
8964 goto onError;
8965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 for (uni2 = 0; repsize-->0; ++uni2)
8967 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8968 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008969 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008970 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008971 }
8972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8974 if (!res)
8975 goto onError;
8976 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008977 Py_XDECREF(exc);
8978 Py_XDECREF(errorHandler);
8979 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008983 Py_XDECREF(exc);
8984 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985 return NULL;
8986}
8987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988/* Deprecated. Use PyUnicode_Translate instead. */
8989PyObject *
8990PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8991 Py_ssize_t size,
8992 PyObject *mapping,
8993 const char *errors)
8994{
8995 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8996 if (!unicode)
8997 return NULL;
8998 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8999}
9000
Alexander Belopolsky40018472011-02-26 01:02:56 +00009001PyObject *
9002PyUnicode_Translate(PyObject *str,
9003 PyObject *mapping,
9004 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005{
9006 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00009007
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 str = PyUnicode_FromObject(str);
9009 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 Py_DECREF(str);
9013 return result;
Tim Petersced69f82003-09-16 20:30:58 +00009014
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 Py_XDECREF(str);
9017 return NULL;
9018}
Tim Petersced69f82003-09-16 20:30:58 +00009019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009020static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009021fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022{
9023 /* No need to call PyUnicode_READY(self) because this function is only
9024 called as a callback from fixup() which does it already. */
9025 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9026 const int kind = PyUnicode_KIND(self);
9027 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009028 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009029 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 Py_ssize_t i;
9031
9032 for (i = 0; i < len; ++i) {
9033 ch = PyUnicode_READ(kind, data, i);
9034 fixed = 0;
9035 if (ch > 127) {
9036 if (Py_UNICODE_ISSPACE(ch))
9037 fixed = ' ';
9038 else {
9039 const int decimal = Py_UNICODE_TODECIMAL(ch);
9040 if (decimal >= 0)
9041 fixed = '0' + decimal;
9042 }
9043 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009044 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02009045 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 PyUnicode_WRITE(kind, data, i, fixed);
9047 }
Victor Stinnere6abb482012-05-02 01:15:40 +02009048 else
9049 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 }
9052
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05009053 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054}
9055
9056PyObject *
9057_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9058{
9059 if (!PyUnicode_Check(unicode)) {
9060 PyErr_BadInternalCall();
9061 return NULL;
9062 }
9063 if (PyUnicode_READY(unicode) == -1)
9064 return NULL;
9065 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
9066 /* If the string is already ASCII, just return the same string */
9067 Py_INCREF(unicode);
9068 return unicode;
9069 }
Victor Stinner9310abb2011-10-05 00:59:23 +02009070 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071}
9072
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009073PyObject *
9074PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9075 Py_ssize_t length)
9076{
Victor Stinnerf0124502011-11-21 23:12:56 +01009077 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009078 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009079 Py_UCS4 maxchar;
9080 enum PyUnicode_Kind kind;
9081 void *data;
9082
Victor Stinner99d7ad02012-02-22 13:37:39 +01009083 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009084 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01009085 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009086 if (ch > 127) {
9087 int decimal = Py_UNICODE_TODECIMAL(ch);
9088 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009089 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02009090 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009091 }
9092 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009093
9094 /* Copy to a new string */
9095 decimal = PyUnicode_New(length, maxchar);
9096 if (decimal == NULL)
9097 return decimal;
9098 kind = PyUnicode_KIND(decimal);
9099 data = PyUnicode_DATA(decimal);
9100 /* Iterate over code points */
9101 for (i = 0; i < length; i++) {
9102 Py_UNICODE ch = s[i];
9103 if (ch > 127) {
9104 int decimal = Py_UNICODE_TODECIMAL(ch);
9105 if (decimal >= 0)
9106 ch = '0' + decimal;
9107 }
9108 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009110 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009111}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009112/* --- Decimal Encoder ---------------------------------------------------- */
9113
Alexander Belopolsky40018472011-02-26 01:02:56 +00009114int
9115PyUnicode_EncodeDecimal(Py_UNICODE *s,
9116 Py_ssize_t length,
9117 char *output,
9118 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009119{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009120 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009121 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009122 enum PyUnicode_Kind kind;
9123 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009124
9125 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 PyErr_BadArgument();
9127 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009128 }
9129
Victor Stinner42bf7752011-11-21 22:52:58 +01009130 unicode = PyUnicode_FromUnicode(s, length);
9131 if (unicode == NULL)
9132 return -1;
9133
Benjamin Petersonbac79492012-01-14 13:34:47 -05009134 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01009135 Py_DECREF(unicode);
9136 return -1;
9137 }
Victor Stinner42bf7752011-11-21 22:52:58 +01009138 kind = PyUnicode_KIND(unicode);
9139 data = PyUnicode_DATA(unicode);
9140
Victor Stinnerb84d7232011-11-22 01:50:07 +01009141 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009142 PyObject *exc;
9143 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009145 Py_ssize_t startpos;
9146
9147 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009148
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009150 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009151 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009153 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 decimal = Py_UNICODE_TODECIMAL(ch);
9155 if (decimal >= 0) {
9156 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009157 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009158 continue;
9159 }
9160 if (0 < ch && ch < 256) {
9161 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009162 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009163 continue;
9164 }
Victor Stinner6345be92011-11-25 20:09:01 +01009165
Victor Stinner42bf7752011-11-21 22:52:58 +01009166 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009167 exc = NULL;
9168 raise_encode_exception(&exc, "decimal", unicode,
9169 startpos, startpos+1,
9170 "invalid decimal Unicode string");
9171 Py_XDECREF(exc);
9172 Py_DECREF(unicode);
9173 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009174 }
9175 /* 0-terminate the output string */
9176 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009177 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009178 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009179}
9180
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181/* --- Helpers ------------------------------------------------------------ */
9182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009184any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 Py_ssize_t start,
9186 Py_ssize_t end)
9187{
9188 int kind1, kind2, kind;
9189 void *buf1, *buf2;
9190 Py_ssize_t len1, len2, result;
9191
9192 kind1 = PyUnicode_KIND(s1);
9193 kind2 = PyUnicode_KIND(s2);
9194 kind = kind1 > kind2 ? kind1 : kind2;
9195 buf1 = PyUnicode_DATA(s1);
9196 buf2 = PyUnicode_DATA(s2);
9197 if (kind1 != kind)
9198 buf1 = _PyUnicode_AsKind(s1, kind);
9199 if (!buf1)
9200 return -2;
9201 if (kind2 != kind)
9202 buf2 = _PyUnicode_AsKind(s2, kind);
9203 if (!buf2) {
9204 if (kind1 != kind) PyMem_Free(buf1);
9205 return -2;
9206 }
9207 len1 = PyUnicode_GET_LENGTH(s1);
9208 len2 = PyUnicode_GET_LENGTH(s2);
9209
Victor Stinner794d5672011-10-10 03:21:36 +02009210 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009211 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009212 case PyUnicode_1BYTE_KIND:
9213 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9214 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9215 else
9216 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9217 break;
9218 case PyUnicode_2BYTE_KIND:
9219 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9220 break;
9221 case PyUnicode_4BYTE_KIND:
9222 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9223 break;
9224 default:
9225 assert(0); result = -2;
9226 }
9227 }
9228 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009229 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009230 case PyUnicode_1BYTE_KIND:
9231 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9232 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9233 else
9234 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9235 break;
9236 case PyUnicode_2BYTE_KIND:
9237 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9238 break;
9239 case PyUnicode_4BYTE_KIND:
9240 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9241 break;
9242 default:
9243 assert(0); result = -2;
9244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 }
9246
9247 if (kind1 != kind)
9248 PyMem_Free(buf1);
9249 if (kind2 != kind)
9250 PyMem_Free(buf2);
9251
9252 return result;
9253}
9254
9255Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009256_PyUnicode_InsertThousandsGrouping(
9257 PyObject *unicode, Py_ssize_t index,
9258 Py_ssize_t n_buffer,
9259 void *digits, Py_ssize_t n_digits,
9260 Py_ssize_t min_width,
9261 const char *grouping, PyObject *thousands_sep,
9262 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263{
Victor Stinner41a863c2012-02-24 00:37:51 +01009264 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009265 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009266 Py_ssize_t thousands_sep_len;
9267 Py_ssize_t len;
9268
9269 if (unicode != NULL) {
9270 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009271 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009272 }
9273 else {
9274 kind = PyUnicode_1BYTE_KIND;
9275 data = NULL;
9276 }
9277 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9278 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9279 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9280 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009281 if (thousands_sep_kind < kind) {
9282 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9283 if (!thousands_sep_data)
9284 return -1;
9285 }
9286 else {
9287 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9288 if (!data)
9289 return -1;
9290 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009291 }
9292
Benjamin Petersonead6b532011-12-20 17:23:42 -06009293 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009294 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009295 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009296 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009297 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009298 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009299 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009300 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009301 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009302 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009303 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009304 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009305 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009307 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009308 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009309 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009310 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009311 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009313 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009314 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009315 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009316 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009317 break;
9318 default:
9319 assert(0);
9320 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009322 if (unicode != NULL && thousands_sep_kind != kind) {
9323 if (thousands_sep_kind < kind)
9324 PyMem_Free(thousands_sep_data);
9325 else
9326 PyMem_Free(data);
9327 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009328 if (unicode == NULL) {
9329 *maxchar = 127;
9330 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009331 *maxchar = MAX_MAXCHAR(*maxchar,
9332 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009333 }
9334 }
9335 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336}
9337
9338
/* helper macro to fixup start/end slice values: `end` is capped at `len`,
   a negative `start` or `end` is shifted up by `len`, and a value that is
   still negative afterwards becomes 0.  `start` is not capped from above.
   Both `start` and `end` are assigned to, so they must be lvalues.
   NOTE(review): the expansion is two bare `if` statements (there is no
   do { } while (0) wrapper) and each argument is evaluated more than once,
   so use it only as a standalone statement with side-effect-free
   arguments. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009353
Alexander Belopolsky40018472011-02-26 01:02:56 +00009354Py_ssize_t
9355PyUnicode_Count(PyObject *str,
9356 PyObject *substr,
9357 Py_ssize_t start,
9358 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009360 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009361 PyObject* str_obj;
9362 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 int kind1, kind2, kind;
9364 void *buf1 = NULL, *buf2 = NULL;
9365 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009366
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009367 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009368 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009370 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009371 if (!sub_obj) {
9372 Py_DECREF(str_obj);
9373 return -1;
9374 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009375 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009376 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 Py_DECREF(str_obj);
9378 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 }
Tim Petersced69f82003-09-16 20:30:58 +00009380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381 kind1 = PyUnicode_KIND(str_obj);
9382 kind2 = PyUnicode_KIND(sub_obj);
9383 kind = kind1 > kind2 ? kind1 : kind2;
9384 buf1 = PyUnicode_DATA(str_obj);
9385 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009386 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 if (!buf1)
9388 goto onError;
9389 buf2 = PyUnicode_DATA(sub_obj);
9390 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009391 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392 if (!buf2)
9393 goto onError;
9394 len1 = PyUnicode_GET_LENGTH(str_obj);
9395 len2 = PyUnicode_GET_LENGTH(sub_obj);
9396
9397 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009398 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009400 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9401 result = asciilib_count(
9402 ((Py_UCS1*)buf1) + start, end - start,
9403 buf2, len2, PY_SSIZE_T_MAX
9404 );
9405 else
9406 result = ucs1lib_count(
9407 ((Py_UCS1*)buf1) + start, end - start,
9408 buf2, len2, PY_SSIZE_T_MAX
9409 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 break;
9411 case PyUnicode_2BYTE_KIND:
9412 result = ucs2lib_count(
9413 ((Py_UCS2*)buf1) + start, end - start,
9414 buf2, len2, PY_SSIZE_T_MAX
9415 );
9416 break;
9417 case PyUnicode_4BYTE_KIND:
9418 result = ucs4lib_count(
9419 ((Py_UCS4*)buf1) + start, end - start,
9420 buf2, len2, PY_SSIZE_T_MAX
9421 );
9422 break;
9423 default:
9424 assert(0); result = 0;
9425 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009426
9427 Py_DECREF(sub_obj);
9428 Py_DECREF(str_obj);
9429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 if (kind1 != kind)
9431 PyMem_Free(buf1);
9432 if (kind2 != kind)
9433 PyMem_Free(buf2);
9434
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 onError:
9437 Py_DECREF(sub_obj);
9438 Py_DECREF(str_obj);
9439 if (kind1 != kind && buf1)
9440 PyMem_Free(buf1);
9441 if (kind2 != kind && buf2)
9442 PyMem_Free(buf2);
9443 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444}
9445
Alexander Belopolsky40018472011-02-26 01:02:56 +00009446Py_ssize_t
9447PyUnicode_Find(PyObject *str,
9448 PyObject *sub,
9449 Py_ssize_t start,
9450 Py_ssize_t end,
9451 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009453 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009454
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009456 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009458 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009459 if (!sub) {
9460 Py_DECREF(str);
9461 return -2;
9462 }
9463 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9464 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009465 Py_DECREF(str);
9466 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467 }
Tim Petersced69f82003-09-16 20:30:58 +00009468
Victor Stinner794d5672011-10-10 03:21:36 +02009469 result = any_find_slice(direction,
9470 str, sub, start, end
9471 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009472
Guido van Rossumd57fd912000-03-10 22:53:23 +00009473 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009474 Py_DECREF(sub);
9475
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 return result;
9477}
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479Py_ssize_t
9480PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9481 Py_ssize_t start, Py_ssize_t end,
9482 int direction)
9483{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009485 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009486 if (PyUnicode_READY(str) == -1)
9487 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009488 if (start < 0 || end < 0) {
9489 PyErr_SetString(PyExc_IndexError, "string index out of range");
9490 return -2;
9491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 if (end > PyUnicode_GET_LENGTH(str))
9493 end = PyUnicode_GET_LENGTH(str);
9494 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009495 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9496 kind, end-start, ch, direction);
9497 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009498 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009499 else
9500 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501}
9502
Alexander Belopolsky40018472011-02-26 01:02:56 +00009503static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009504tailmatch(PyObject *self,
9505 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009506 Py_ssize_t start,
9507 Py_ssize_t end,
9508 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 int kind_self;
9511 int kind_sub;
9512 void *data_self;
9513 void *data_sub;
9514 Py_ssize_t offset;
9515 Py_ssize_t i;
9516 Py_ssize_t end_sub;
9517
9518 if (PyUnicode_READY(self) == -1 ||
9519 PyUnicode_READY(substring) == -1)
9520 return 0;
9521
9522 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523 return 1;
9524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009525 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9526 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009528 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 kind_self = PyUnicode_KIND(self);
9531 data_self = PyUnicode_DATA(self);
9532 kind_sub = PyUnicode_KIND(substring);
9533 data_sub = PyUnicode_DATA(substring);
9534 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9535
9536 if (direction > 0)
9537 offset = end;
9538 else
9539 offset = start;
9540
9541 if (PyUnicode_READ(kind_self, data_self, offset) ==
9542 PyUnicode_READ(kind_sub, data_sub, 0) &&
9543 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9544 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9545 /* If both are of the same kind, memcmp is sufficient */
9546 if (kind_self == kind_sub) {
9547 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009548 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 data_sub,
9550 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009551 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 }
9553 /* otherwise we have to compare each character by first accesing it */
9554 else {
9555 /* We do not need to compare 0 and len(substring)-1 because
9556 the if statement above ensured already that they are equal
9557 when we end up here. */
9558 // TODO: honor direction and do a forward or backwards search
9559 for (i = 1; i < end_sub; ++i) {
9560 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9561 PyUnicode_READ(kind_sub, data_sub, i))
9562 return 0;
9563 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009564 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566 }
9567
9568 return 0;
9569}
9570
Alexander Belopolsky40018472011-02-26 01:02:56 +00009571Py_ssize_t
9572PyUnicode_Tailmatch(PyObject *str,
9573 PyObject *substr,
9574 Py_ssize_t start,
9575 Py_ssize_t end,
9576 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009577{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009578 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580 str = PyUnicode_FromObject(str);
9581 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 substr = PyUnicode_FromObject(substr);
9584 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009585 Py_DECREF(str);
9586 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 }
Tim Petersced69f82003-09-16 20:30:58 +00009588
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009589 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009591 Py_DECREF(str);
9592 Py_DECREF(substr);
9593 return result;
9594}
9595
Guido van Rossumd57fd912000-03-10 22:53:23 +00009596/* Apply fixfct filter to the Unicode object self and return a
9597 reference to the modified object */
9598
Alexander Belopolsky40018472011-02-26 01:02:56 +00009599static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009600fixup(PyObject *self,
9601 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009602{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 PyObject *u;
9604 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009605 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009607 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009608 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009609 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009610 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 /* fix functions return the new maximum character in a string,
9613 if the kind of the resulting unicode object does not change,
9614 everything is fine. Otherwise we need to change the string kind
9615 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009616 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009617
9618 if (maxchar_new == 0) {
9619 /* no changes */;
9620 if (PyUnicode_CheckExact(self)) {
9621 Py_DECREF(u);
9622 Py_INCREF(self);
9623 return self;
9624 }
9625 else
9626 return u;
9627 }
9628
Victor Stinnere6abb482012-05-02 01:15:40 +02009629 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630
Victor Stinnereaab6042011-12-11 22:22:39 +01009631 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009633
9634 /* In case the maximum character changed, we need to
9635 convert the string to the new category. */
9636 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9637 if (v == NULL) {
9638 Py_DECREF(u);
9639 return NULL;
9640 }
9641 if (maxchar_new > maxchar_old) {
9642 /* If the maxchar increased so that the kind changed, not all
9643 characters are representable anymore and we need to fix the
9644 string again. This only happens in very few cases. */
9645 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
9646 maxchar_old = fixfct(v);
9647 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 }
9649 else {
Victor Stinnereaab6042011-12-11 22:22:39 +01009650 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009652 Py_DECREF(u);
9653 assert(_PyUnicode_CheckConsistency(v, 1));
9654 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655}
9656
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009657static PyObject *
9658ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009659{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009660 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9661 char *resdata, *data = PyUnicode_DATA(self);
9662 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009663
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009664 res = PyUnicode_New(len, 127);
9665 if (res == NULL)
9666 return NULL;
9667 resdata = PyUnicode_DATA(res);
9668 if (lower)
9669 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009671 _Py_bytes_upper(resdata, data, len);
9672 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673}
9674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009676handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009678 Py_ssize_t j;
9679 int final_sigma;
9680 Py_UCS4 c;
9681 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009682
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9684
9685 where ! is a negation and \p{xxx} is a character with property xxx.
9686 */
9687 for (j = i - 1; j >= 0; j--) {
9688 c = PyUnicode_READ(kind, data, j);
9689 if (!_PyUnicode_IsCaseIgnorable(c))
9690 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009692 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9693 if (final_sigma) {
9694 for (j = i + 1; j < length; j++) {
9695 c = PyUnicode_READ(kind, data, j);
9696 if (!_PyUnicode_IsCaseIgnorable(c))
9697 break;
9698 }
9699 final_sigma = j == length || !_PyUnicode_IsCased(c);
9700 }
9701 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009702}
9703
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704static int
9705lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9706 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 /* Obscure special case. */
9709 if (c == 0x3A3) {
9710 mapped[0] = handle_capital_sigma(kind, data, length, i);
9711 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009714}
9715
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716static Py_ssize_t
9717do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009718{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 Py_ssize_t i, k = 0;
9720 int n_res, j;
9721 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009722
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009723 c = PyUnicode_READ(kind, data, 0);
9724 n_res = _PyUnicode_ToUpperFull(c, mapped);
9725 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009726 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009727 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009729 for (i = 1; i < length; i++) {
9730 c = PyUnicode_READ(kind, data, i);
9731 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9732 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009733 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009735 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009736 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738}
9739
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740static Py_ssize_t
9741do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9742 Py_ssize_t i, k = 0;
9743
9744 for (i = 0; i < length; i++) {
9745 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9746 int n_res, j;
9747 if (Py_UNICODE_ISUPPER(c)) {
9748 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9749 }
9750 else if (Py_UNICODE_ISLOWER(c)) {
9751 n_res = _PyUnicode_ToUpperFull(c, mapped);
9752 }
9753 else {
9754 n_res = 1;
9755 mapped[0] = c;
9756 }
9757 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009758 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 res[k++] = mapped[j];
9760 }
9761 }
9762 return k;
9763}
9764
9765static Py_ssize_t
9766do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9767 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009769 Py_ssize_t i, k = 0;
9770
9771 for (i = 0; i < length; i++) {
9772 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9773 int n_res, j;
9774 if (lower)
9775 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9776 else
9777 n_res = _PyUnicode_ToUpperFull(c, mapped);
9778 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009779 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009780 res[k++] = mapped[j];
9781 }
9782 }
9783 return k;
9784}
9785
9786static Py_ssize_t
9787do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9788{
9789 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9790}
9791
9792static Py_ssize_t
9793do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9794{
9795 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9796}
9797
Benjamin Petersone51757f2012-01-12 21:10:29 -05009798static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009799do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9800{
9801 Py_ssize_t i, k = 0;
9802
9803 for (i = 0; i < length; i++) {
9804 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9805 Py_UCS4 mapped[3];
9806 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9807 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009808 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009809 res[k++] = mapped[j];
9810 }
9811 }
9812 return k;
9813}
9814
9815static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009816do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9817{
9818 Py_ssize_t i, k = 0;
9819 int previous_is_cased;
9820
9821 previous_is_cased = 0;
9822 for (i = 0; i < length; i++) {
9823 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9824 Py_UCS4 mapped[3];
9825 int n_res, j;
9826
9827 if (previous_is_cased)
9828 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9829 else
9830 n_res = _PyUnicode_ToTitleFull(c, mapped);
9831
9832 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009833 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009834 res[k++] = mapped[j];
9835 }
9836
9837 previous_is_cased = _PyUnicode_IsCased(c);
9838 }
9839 return k;
9840}
9841
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842static PyObject *
9843case_operation(PyObject *self,
9844 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9845{
9846 PyObject *res = NULL;
9847 Py_ssize_t length, newlength = 0;
9848 int kind, outkind;
9849 void *data, *outdata;
9850 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9851
Benjamin Petersoneea48462012-01-16 14:28:50 -05009852 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853
9854 kind = PyUnicode_KIND(self);
9855 data = PyUnicode_DATA(self);
9856 length = PyUnicode_GET_LENGTH(self);
9857 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9858 if (tmp == NULL)
9859 return PyErr_NoMemory();
9860 newlength = perform(kind, data, length, tmp, &maxchar);
9861 res = PyUnicode_New(newlength, maxchar);
9862 if (res == NULL)
9863 goto leave;
9864 tmpend = tmp + newlength;
9865 outdata = PyUnicode_DATA(res);
9866 outkind = PyUnicode_KIND(res);
9867 switch (outkind) {
9868 case PyUnicode_1BYTE_KIND:
9869 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9870 break;
9871 case PyUnicode_2BYTE_KIND:
9872 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9873 break;
9874 case PyUnicode_4BYTE_KIND:
9875 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9876 break;
9877 default:
9878 assert(0);
9879 break;
9880 }
9881 leave:
9882 PyMem_FREE(tmp);
9883 return res;
9884}
9885
Tim Peters8ce9f162004-08-27 01:49:32 +00009886PyObject *
9887PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009889 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009890 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009892 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009893 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9894 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009895 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009897 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009899 int use_memcpy;
9900 unsigned char *res_data = NULL, *sep_data = NULL;
9901 PyObject *last_obj;
9902 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903
Tim Peters05eba1f2004-08-27 21:32:02 +00009904 fseq = PySequence_Fast(seq, "");
9905 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009906 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009907 }
9908
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009909 /* NOTE: the following code can't call back into Python code,
9910 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009911 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009912
Tim Peters05eba1f2004-08-27 21:32:02 +00009913 seqlen = PySequence_Fast_GET_SIZE(fseq);
9914 /* If empty sequence, return u"". */
9915 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009916 Py_DECREF(fseq);
9917 Py_INCREF(unicode_empty);
9918 res = unicode_empty;
9919 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009920 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009921
Tim Peters05eba1f2004-08-27 21:32:02 +00009922 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009923 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009924 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009925 if (seqlen == 1) {
9926 if (PyUnicode_CheckExact(items[0])) {
9927 res = items[0];
9928 Py_INCREF(res);
9929 Py_DECREF(fseq);
9930 return res;
9931 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009932 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009933 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009934 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009936 /* Set up sep and seplen */
9937 if (separator == NULL) {
9938 /* fall back to a blank space separator */
9939 sep = PyUnicode_FromOrdinal(' ');
9940 if (!sep)
9941 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009942 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009943 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009944 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009945 else {
9946 if (!PyUnicode_Check(separator)) {
9947 PyErr_Format(PyExc_TypeError,
9948 "separator: expected str instance,"
9949 " %.80s found",
9950 Py_TYPE(separator)->tp_name);
9951 goto onError;
9952 }
9953 if (PyUnicode_READY(separator))
9954 goto onError;
9955 sep = separator;
9956 seplen = PyUnicode_GET_LENGTH(separator);
9957 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9958 /* inc refcount to keep this code path symmetric with the
9959 above case of a blank separator */
9960 Py_INCREF(sep);
9961 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009962 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009963 }
9964
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009965 /* There are at least two things to join, or else we have a subclass
9966 * of str in the sequence.
9967 * Do a pre-pass to figure out the total amount of space we'll
9968 * need (sz), and see whether all argument are strings.
9969 */
9970 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009971#ifdef Py_DEBUG
9972 use_memcpy = 0;
9973#else
9974 use_memcpy = 1;
9975#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009976 for (i = 0; i < seqlen; i++) {
9977 const Py_ssize_t old_sz = sz;
9978 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009979 if (!PyUnicode_Check(item)) {
9980 PyErr_Format(PyExc_TypeError,
9981 "sequence item %zd: expected str instance,"
9982 " %.80s found",
9983 i, Py_TYPE(item)->tp_name);
9984 goto onError;
9985 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 if (PyUnicode_READY(item) == -1)
9987 goto onError;
9988 sz += PyUnicode_GET_LENGTH(item);
9989 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009990 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009991 if (i != 0)
9992 sz += seplen;
9993 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9994 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009995 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 goto onError;
9997 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009998 if (use_memcpy && last_obj != NULL) {
9999 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10000 use_memcpy = 0;
10001 }
10002 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 }
Tim Petersced69f82003-09-16 20:30:58 +000010004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010006 if (res == NULL)
10007 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010008
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010010#ifdef Py_DEBUG
10011 use_memcpy = 0;
10012#else
10013 if (use_memcpy) {
10014 res_data = PyUnicode_1BYTE_DATA(res);
10015 kind = PyUnicode_KIND(res);
10016 if (seplen != 0)
10017 sep_data = PyUnicode_1BYTE_DATA(sep);
10018 }
10019#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010021 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010022 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010023 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +020010024 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010025 if (use_memcpy) {
10026 Py_MEMCPY(res_data,
10027 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010028 kind * seplen);
10029 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010030 }
10031 else {
10032 copy_characters(res, res_offset, sep, 0, seplen);
10033 res_offset += seplen;
10034 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010035 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010036 itemlen = PyUnicode_GET_LENGTH(item);
10037 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +020010038 if (use_memcpy) {
10039 Py_MEMCPY(res_data,
10040 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010041 kind * itemlen);
10042 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010043 }
10044 else {
10045 copy_characters(res, res_offset, item, 0, itemlen);
10046 res_offset += itemlen;
10047 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010048 }
Tim Peters05eba1f2004-08-27 21:32:02 +000010049 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010050 if (use_memcpy)
10051 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010052 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 else
10054 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +000010055
Tim Peters05eba1f2004-08-27 21:32:02 +000010056 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010058 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060
Benjamin Peterson29060642009-01-31 22:14:21 +000010061 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +000010062 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010064 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010065 return NULL;
10066}
10067
/* FILL(kind, data, value, start, length)

   Store the code point `value` into `length` consecutive characters of the
   character buffer `data`, beginning at character index `start`.  `kind`
   selects the element width (1, 2 or 4 bytes); the wchar kind is not
   accepted.  `value` and `length` are evaluated more than once in the 2- and
   4-byte cases, so pass side-effect-free expressions.

   Every macro argument is parenthesized at its point of use so that compound
   expressions (e.g. `a + b`) expand with the intended precedence, and the
   `default` label sits directly in the switch body rather than inside the
   4-byte case's braces. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
10091
Victor Stinner3fe55312012-01-04 00:33:50 +010010092Py_ssize_t
10093PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10094 Py_UCS4 fill_char)
10095{
10096 Py_ssize_t maxlen;
10097 enum PyUnicode_Kind kind;
10098 void *data;
10099
10100 if (!PyUnicode_Check(unicode)) {
10101 PyErr_BadInternalCall();
10102 return -1;
10103 }
10104 if (PyUnicode_READY(unicode) == -1)
10105 return -1;
10106 if (unicode_check_modifiable(unicode))
10107 return -1;
10108
10109 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10110 PyErr_SetString(PyExc_ValueError,
10111 "fill character is bigger than "
10112 "the string maximum character");
10113 return -1;
10114 }
10115
10116 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10117 length = Py_MIN(maxlen, length);
10118 if (length <= 0)
10119 return 0;
10120
10121 kind = PyUnicode_KIND(unicode);
10122 data = PyUnicode_DATA(unicode);
10123 FILL(kind, data, fill_char, start, length);
10124 return length;
10125}
10126
Victor Stinner9310abb2011-10-05 00:59:23 +020010127static PyObject *
10128pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010129 Py_ssize_t left,
10130 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 PyObject *u;
10134 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010135 int kind;
10136 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137
10138 if (left < 0)
10139 left = 0;
10140 if (right < 0)
10141 right = 0;
10142
Victor Stinnerc4b49542011-12-11 22:44:26 +010010143 if (left == 0 && right == 0)
10144 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10147 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010148 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10149 return NULL;
10150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +020010152 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010154 if (!u)
10155 return NULL;
10156
10157 kind = PyUnicode_KIND(u);
10158 data = PyUnicode_DATA(u);
10159 if (left)
10160 FILL(kind, data, fill, 0, left);
10161 if (right)
10162 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010163 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010164 assert(_PyUnicode_CheckConsistency(u, 1));
10165 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166}
10167
Alexander Belopolsky40018472011-02-26 01:02:56 +000010168PyObject *
10169PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
10173 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010174 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010175 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060010176 if (PyUnicode_READY(string) == -1) {
10177 Py_DECREF(string);
10178 return NULL;
10179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Benjamin Petersonead6b532011-12-20 17:23:42 -060010181 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 if (PyUnicode_IS_ASCII(string))
10184 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 PyUnicode_GET_LENGTH(string), keepends);
10187 else
10188 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010189 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 break;
10192 case PyUnicode_2BYTE_KIND:
10193 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010194 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010195 PyUnicode_GET_LENGTH(string), keepends);
10196 break;
10197 case PyUnicode_4BYTE_KIND:
10198 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010199 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 PyUnicode_GET_LENGTH(string), keepends);
10201 break;
10202 default:
10203 assert(0);
10204 list = 0;
10205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206 Py_DECREF(string);
10207 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208}
10209
Alexander Belopolsky40018472011-02-26 01:02:56 +000010210static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010211split(PyObject *self,
10212 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010213 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 int kind1, kind2, kind;
10216 void *buf1, *buf2;
10217 Py_ssize_t len1, len2;
10218 PyObject* out;
10219
Guido van Rossumd57fd912000-03-10 22:53:23 +000010220 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010221 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (PyUnicode_READY(self) == -1)
10224 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010227 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 if (PyUnicode_IS_ASCII(self))
10230 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010231 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010232 PyUnicode_GET_LENGTH(self), maxcount
10233 );
10234 else
10235 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010236 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010237 PyUnicode_GET_LENGTH(self), maxcount
10238 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 case PyUnicode_2BYTE_KIND:
10240 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 PyUnicode_GET_LENGTH(self), maxcount
10243 );
10244 case PyUnicode_4BYTE_KIND:
10245 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010246 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 PyUnicode_GET_LENGTH(self), maxcount
10248 );
10249 default:
10250 assert(0);
10251 return NULL;
10252 }
10253
10254 if (PyUnicode_READY(substring) == -1)
10255 return NULL;
10256
10257 kind1 = PyUnicode_KIND(self);
10258 kind2 = PyUnicode_KIND(substring);
10259 kind = kind1 > kind2 ? kind1 : kind2;
10260 buf1 = PyUnicode_DATA(self);
10261 buf2 = PyUnicode_DATA(substring);
10262 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 if (!buf1)
10265 return NULL;
10266 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 if (!buf2) {
10269 if (kind1 != kind) PyMem_Free(buf1);
10270 return NULL;
10271 }
10272 len1 = PyUnicode_GET_LENGTH(self);
10273 len2 = PyUnicode_GET_LENGTH(substring);
10274
Benjamin Petersonead6b532011-12-20 17:23:42 -060010275 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010277 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10278 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010279 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010280 else
10281 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 break;
10284 case PyUnicode_2BYTE_KIND:
10285 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010286 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 break;
10288 case PyUnicode_4BYTE_KIND:
10289 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010290 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 break;
10292 default:
10293 out = NULL;
10294 }
10295 if (kind1 != kind)
10296 PyMem_Free(buf1);
10297 if (kind2 != kind)
10298 PyMem_Free(buf2);
10299 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300}
10301
Alexander Belopolsky40018472011-02-26 01:02:56 +000010302static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010303rsplit(PyObject *self,
10304 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010305 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 int kind1, kind2, kind;
10308 void *buf1, *buf2;
10309 Py_ssize_t len1, len2;
10310 PyObject* out;
10311
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010312 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010313 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010314
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 if (PyUnicode_READY(self) == -1)
10316 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010319 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010321 if (PyUnicode_IS_ASCII(self))
10322 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010323 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010324 PyUnicode_GET_LENGTH(self), maxcount
10325 );
10326 else
10327 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010328 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010329 PyUnicode_GET_LENGTH(self), maxcount
10330 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 case PyUnicode_2BYTE_KIND:
10332 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010333 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 PyUnicode_GET_LENGTH(self), maxcount
10335 );
10336 case PyUnicode_4BYTE_KIND:
10337 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010338 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 PyUnicode_GET_LENGTH(self), maxcount
10340 );
10341 default:
10342 assert(0);
10343 return NULL;
10344 }
10345
10346 if (PyUnicode_READY(substring) == -1)
10347 return NULL;
10348
10349 kind1 = PyUnicode_KIND(self);
10350 kind2 = PyUnicode_KIND(substring);
10351 kind = kind1 > kind2 ? kind1 : kind2;
10352 buf1 = PyUnicode_DATA(self);
10353 buf2 = PyUnicode_DATA(substring);
10354 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 if (!buf1)
10357 return NULL;
10358 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (!buf2) {
10361 if (kind1 != kind) PyMem_Free(buf1);
10362 return NULL;
10363 }
10364 len1 = PyUnicode_GET_LENGTH(self);
10365 len2 = PyUnicode_GET_LENGTH(substring);
10366
Benjamin Petersonead6b532011-12-20 17:23:42 -060010367 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010369 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10370 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010371 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010372 else
10373 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010374 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 break;
10376 case PyUnicode_2BYTE_KIND:
10377 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010378 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 break;
10380 case PyUnicode_4BYTE_KIND:
10381 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010382 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 break;
10384 default:
10385 out = NULL;
10386 }
10387 if (kind1 != kind)
10388 PyMem_Free(buf1);
10389 if (kind2 != kind)
10390 PyMem_Free(buf2);
10391 return out;
10392}
10393
10394static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10396 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010398 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10401 return asciilib_find(buf1, len1, buf2, len2, offset);
10402 else
10403 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 case PyUnicode_2BYTE_KIND:
10405 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10406 case PyUnicode_4BYTE_KIND:
10407 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10408 }
10409 assert(0);
10410 return -1;
10411}
10412
10413static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010414anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10415 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010417 switch (kind) {
10418 case PyUnicode_1BYTE_KIND:
10419 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10420 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10421 else
10422 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10423 case PyUnicode_2BYTE_KIND:
10424 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10425 case PyUnicode_4BYTE_KIND:
10426 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10427 }
10428 assert(0);
10429 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010430}
10431
Alexander Belopolsky40018472011-02-26 01:02:56 +000010432static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433replace(PyObject *self, PyObject *str1,
10434 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 PyObject *u;
10437 char *sbuf = PyUnicode_DATA(self);
10438 char *buf1 = PyUnicode_DATA(str1);
10439 char *buf2 = PyUnicode_DATA(str2);
10440 int srelease = 0, release1 = 0, release2 = 0;
10441 int skind = PyUnicode_KIND(self);
10442 int kind1 = PyUnicode_KIND(str1);
10443 int kind2 = PyUnicode_KIND(str2);
10444 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10445 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10446 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010447 int mayshrink;
10448 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010449
10450 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010451 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010453 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010454
Victor Stinner59de0ee2011-10-07 10:01:28 +020010455 if (str1 == str2)
10456 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 if (skind < kind1)
10458 /* substring too wide to be present */
10459 goto nothing;
10460
Victor Stinner49a0a212011-10-12 23:46:10 +020010461 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10462 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10463 /* Replacing str1 with str2 may cause a maxchar reduction in the
10464 result string. */
10465 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010466 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010467
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010471 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010472 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010473 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010474 Py_UCS4 u1, u2;
10475 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010476 Py_ssize_t index, pos;
10477 char *src;
10478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010480 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10481 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010482 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010485 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010487 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010489
10490 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10491 index = 0;
10492 src = sbuf;
10493 while (--maxcount)
10494 {
10495 pos++;
10496 src += pos * PyUnicode_KIND(self);
10497 slen -= pos;
10498 index += pos;
10499 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10500 if (pos < 0)
10501 break;
10502 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10503 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010504 }
10505 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 int rkind = skind;
10507 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010508 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 if (kind1 < rkind) {
10511 /* widen substring */
10512 buf1 = _PyUnicode_AsKind(str1, rkind);
10513 if (!buf1) goto error;
10514 release1 = 1;
10515 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010516 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 if (i < 0)
10518 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 if (rkind > kind2) {
10520 /* widen replacement */
10521 buf2 = _PyUnicode_AsKind(str2, rkind);
10522 if (!buf2) goto error;
10523 release2 = 1;
10524 }
10525 else if (rkind < kind2) {
10526 /* widen self and buf1 */
10527 rkind = kind2;
10528 if (release1) PyMem_Free(buf1);
10529 sbuf = _PyUnicode_AsKind(self, rkind);
10530 if (!sbuf) goto error;
10531 srelease = 1;
10532 buf1 = _PyUnicode_AsKind(str1, rkind);
10533 if (!buf1) goto error;
10534 release1 = 1;
10535 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010536 u = PyUnicode_New(slen, maxchar);
10537 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010539 assert(PyUnicode_KIND(u) == rkind);
10540 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010541
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010542 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010543 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010544 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010546 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010548
10549 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010550 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010551 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010553 if (i == -1)
10554 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010555 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010557 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010561 }
10562 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 Py_ssize_t n, i, j, ires;
10564 Py_ssize_t product, new_size;
10565 int rkind = skind;
10566 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010568 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010569 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 buf1 = _PyUnicode_AsKind(str1, rkind);
10571 if (!buf1) goto error;
10572 release1 = 1;
10573 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010574 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010575 if (n == 0)
10576 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010578 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 buf2 = _PyUnicode_AsKind(str2, rkind);
10580 if (!buf2) goto error;
10581 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010584 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 rkind = kind2;
10586 sbuf = _PyUnicode_AsKind(self, rkind);
10587 if (!sbuf) goto error;
10588 srelease = 1;
10589 if (release1) PyMem_Free(buf1);
10590 buf1 = _PyUnicode_AsKind(str1, rkind);
10591 if (!buf1) goto error;
10592 release1 = 1;
10593 }
10594 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10595 PyUnicode_GET_LENGTH(str1))); */
10596 product = n * (len2-len1);
10597 if ((product / (len2-len1)) != n) {
10598 PyErr_SetString(PyExc_OverflowError,
10599 "replace string is too long");
10600 goto error;
10601 }
10602 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010603 if (new_size == 0) {
10604 Py_INCREF(unicode_empty);
10605 u = unicode_empty;
10606 goto done;
10607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10609 PyErr_SetString(PyExc_OverflowError,
10610 "replace string is too long");
10611 goto error;
10612 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 u = PyUnicode_New(new_size, maxchar);
10614 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010616 assert(PyUnicode_KIND(u) == rkind);
10617 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 ires = i = 0;
10619 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 while (n-- > 0) {
10621 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010622 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010623 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010624 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010625 if (j == -1)
10626 break;
10627 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010628 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010629 memcpy(res + rkind * ires,
10630 sbuf + rkind * i,
10631 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633 }
10634 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010636 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010638 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010645 memcpy(res + rkind * ires,
10646 sbuf + rkind * i,
10647 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010648 }
10649 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 /* interleave */
10651 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010652 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 if (--n <= 0)
10657 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010658 memcpy(res + rkind * ires,
10659 sbuf + rkind * i,
10660 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 ires++;
10662 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
10665 sbuf + rkind * i,
10666 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010668 }
10669
10670 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010671 unicode_adjust_maxchar(&u);
10672 if (u == NULL)
10673 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010675
10676 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (srelease)
10678 PyMem_FREE(sbuf);
10679 if (release1)
10680 PyMem_FREE(buf1);
10681 if (release2)
10682 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010683 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (srelease)
10689 PyMem_FREE(sbuf);
10690 if (release1)
10691 PyMem_FREE(buf1);
10692 if (release2)
10693 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010694 return unicode_result_unchanged(self);
10695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 error:
10697 if (srelease && sbuf)
10698 PyMem_FREE(sbuf);
10699 if (release1 && buf1)
10700 PyMem_FREE(buf1);
10701 if (release2 && buf2)
10702 PyMem_FREE(buf2);
10703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704}
10705
10706/* --- Unicode Object Methods --------------------------------------------- */
10707
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010708PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010709 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710\n\
10711Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010712characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
10714static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010715unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010717 if (PyUnicode_READY(self) == -1)
10718 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010719 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720}
10721
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010722PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724\n\
10725Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010726have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727
10728static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010729unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010731 if (PyUnicode_READY(self) == -1)
10732 return NULL;
10733 if (PyUnicode_GET_LENGTH(self) == 0)
10734 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010735 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736}
10737
Benjamin Petersond5890c82012-01-14 13:23:30 -050010738PyDoc_STRVAR(casefold__doc__,
10739 "S.casefold() -> str\n\
10740\n\
10741Return a version of S suitable for caseless comparisons.");
10742
10743static PyObject *
10744unicode_casefold(PyObject *self)
10745{
10746 if (PyUnicode_READY(self) == -1)
10747 return NULL;
10748 if (PyUnicode_IS_ASCII(self))
10749 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010750 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010751}
10752
10753
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010754/* Argument converter. Coerces to a single unicode character */
10755
10756static int
10757convert_uc(PyObject *obj, void *addr)
10758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010760 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010761
Benjamin Peterson14339b62009-01-31 16:36:08 +000010762 uniobj = PyUnicode_FromObject(obj);
10763 if (uniobj == NULL) {
10764 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010765 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010766 return 0;
10767 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010769 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010770 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010771 Py_DECREF(uniobj);
10772 return 0;
10773 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010775 Py_DECREF(uniobj);
10776 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010777}
10778
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010779PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010780 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010782Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010783done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784
10785static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010786unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010787{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010788 Py_ssize_t marg, left;
10789 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 Py_UCS4 fillchar = ' ';
10791
Victor Stinnere9a29352011-10-01 02:14:59 +020010792 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794
Benjamin Petersonbac79492012-01-14 13:34:47 -050010795 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796 return NULL;
10797
Victor Stinnerc4b49542011-12-11 22:44:26 +010010798 if (PyUnicode_GET_LENGTH(self) >= width)
10799 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800
Victor Stinnerc4b49542011-12-11 22:44:26 +010010801 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802 left = marg / 2 + (marg & width & 1);
10803
Victor Stinner9310abb2011-10-05 00:59:23 +020010804 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805}
10806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807/* This function assumes that str1 and str2 are readied by the caller. */
10808
Marc-André Lemburge5034372000-08-08 08:04:29 +000010809static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010810unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010811{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 int kind1, kind2;
10813 void *data1, *data2;
10814 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 kind1 = PyUnicode_KIND(str1);
10817 kind2 = PyUnicode_KIND(str2);
10818 data1 = PyUnicode_DATA(str1);
10819 data2 = PyUnicode_DATA(str2);
10820 len1 = PyUnicode_GET_LENGTH(str1);
10821 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010823 for (i = 0; i < len1 && i < len2; ++i) {
10824 Py_UCS4 c1, c2;
10825 c1 = PyUnicode_READ(kind1, data1, i);
10826 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010827
10828 if (c1 != c2)
10829 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010830 }
10831
10832 return (len1 < len2) ? -1 : (len1 != len2);
10833}
10834
Alexander Belopolsky40018472011-02-26 01:02:56 +000010835int
10836PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10839 if (PyUnicode_READY(left) == -1 ||
10840 PyUnicode_READY(right) == -1)
10841 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010842 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010844 PyErr_Format(PyExc_TypeError,
10845 "Can't compare %.100s and %.100s",
10846 left->ob_type->tp_name,
10847 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848 return -1;
10849}
10850
Martin v. Löwis5b222132007-06-10 09:51:05 +000010851int
10852PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 Py_ssize_t i;
10855 int kind;
10856 void *data;
10857 Py_UCS4 chr;
10858
Victor Stinner910337b2011-10-03 03:20:16 +020010859 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860 if (PyUnicode_READY(uni) == -1)
10861 return -1;
10862 kind = PyUnicode_KIND(uni);
10863 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010864 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10866 if (chr != str[i])
10867 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010868 /* This check keeps Python strings that end in '\0' from comparing equal
10869 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010871 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010872 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010874 return 0;
10875}
10876
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010877
/* Map a C truth value onto Py_True / Py_False.  The expansion does not
   touch the reference count; the user of the macro has to INCREF. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010880
Alexander Belopolsky40018472011-02-26 01:02:56 +000010881PyObject *
10882PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010883{
10884 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010885
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010886 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10887 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 if (PyUnicode_READY(left) == -1 ||
10889 PyUnicode_READY(right) == -1)
10890 return NULL;
10891 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10892 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010893 if (op == Py_EQ) {
10894 Py_INCREF(Py_False);
10895 return Py_False;
10896 }
10897 if (op == Py_NE) {
10898 Py_INCREF(Py_True);
10899 return Py_True;
10900 }
10901 }
10902 if (left == right)
10903 result = 0;
10904 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010905 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010906
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010907 /* Convert the return value to a Boolean */
10908 switch (op) {
10909 case Py_EQ:
10910 v = TEST_COND(result == 0);
10911 break;
10912 case Py_NE:
10913 v = TEST_COND(result != 0);
10914 break;
10915 case Py_LE:
10916 v = TEST_COND(result <= 0);
10917 break;
10918 case Py_GE:
10919 v = TEST_COND(result >= 0);
10920 break;
10921 case Py_LT:
10922 v = TEST_COND(result == -1);
10923 break;
10924 case Py_GT:
10925 v = TEST_COND(result == 1);
10926 break;
10927 default:
10928 PyErr_BadArgument();
10929 return NULL;
10930 }
10931 Py_INCREF(v);
10932 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010934
Brian Curtindfc80e32011-08-10 20:28:54 -050010935 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010936}
10937
Alexander Belopolsky40018472011-02-26 01:02:56 +000010938int
10939PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010940{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010941 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 int kind1, kind2, kind;
10943 void *buf1, *buf2;
10944 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010945 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010946
10947 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010948 sub = PyUnicode_FromObject(element);
10949 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 PyErr_Format(PyExc_TypeError,
10951 "'in <string>' requires string as left operand, not %s",
10952 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010953 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010954 }
10955
Thomas Wouters477c8d52006-05-27 19:21:47 +000010956 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010957 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010958 Py_DECREF(sub);
10959 return -1;
10960 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010961 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10962 Py_DECREF(sub);
10963 Py_DECREF(str);
10964 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 kind1 = PyUnicode_KIND(str);
10967 kind2 = PyUnicode_KIND(sub);
10968 kind = kind1 > kind2 ? kind1 : kind2;
10969 buf1 = PyUnicode_DATA(str);
10970 buf2 = PyUnicode_DATA(sub);
10971 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010972 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 if (!buf1) {
10974 Py_DECREF(sub);
10975 return -1;
10976 }
10977 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010978 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 if (!buf2) {
10980 Py_DECREF(sub);
10981 if (kind1 != kind) PyMem_Free(buf1);
10982 return -1;
10983 }
10984 len1 = PyUnicode_GET_LENGTH(str);
10985 len2 = PyUnicode_GET_LENGTH(sub);
10986
Benjamin Petersonead6b532011-12-20 17:23:42 -060010987 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 case PyUnicode_1BYTE_KIND:
10989 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10990 break;
10991 case PyUnicode_2BYTE_KIND:
10992 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10993 break;
10994 case PyUnicode_4BYTE_KIND:
10995 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10996 break;
10997 default:
10998 result = -1;
10999 assert(0);
11000 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011001
11002 Py_DECREF(str);
11003 Py_DECREF(sub);
11004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (kind1 != kind)
11006 PyMem_Free(buf1);
11007 if (kind2 != kind)
11008 PyMem_Free(buf2);
11009
Guido van Rossum403d68b2000-03-13 15:55:09 +000011010 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011011}
11012
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013/* Concat to string or Unicode object giving a new Unicode object. */
11014
Alexander Belopolsky40018472011-02-26 01:02:56 +000011015PyObject *
11016PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011018 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020011019 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010011020 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
11022 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011025 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029
11030 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011031 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011035 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 }
11039
Victor Stinner488fa492011-12-12 00:01:39 +010011040 u_len = PyUnicode_GET_LENGTH(u);
11041 v_len = PyUnicode_GET_LENGTH(v);
11042 if (u_len > PY_SSIZE_T_MAX - v_len) {
11043 PyErr_SetString(PyExc_OverflowError,
11044 "strings are too large to concat");
11045 goto onError;
11046 }
11047 new_len = u_len + v_len;
11048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011049 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011050 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020011051 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011054 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 goto onError;
Victor Stinner488fa492011-12-12 00:01:39 +010011057 copy_characters(w, 0, u, 0, u_len);
11058 copy_characters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059 Py_DECREF(u);
11060 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011061 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
Benjamin Peterson29060642009-01-31 22:14:21 +000011064 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065 Py_XDECREF(u);
11066 Py_XDECREF(v);
11067 return NULL;
11068}
11069
Walter Dörwald1ab83302007-05-18 17:15:44 +000011070void
Victor Stinner23e56682011-10-03 03:54:37 +020011071PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011072{
Victor Stinner23e56682011-10-03 03:54:37 +020011073 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011074 Py_UCS4 maxchar, maxchar2;
11075 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011076
11077 if (p_left == NULL) {
11078 if (!PyErr_Occurred())
11079 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011080 return;
11081 }
Victor Stinner23e56682011-10-03 03:54:37 +020011082 left = *p_left;
11083 if (right == NULL || !PyUnicode_Check(left)) {
11084 if (!PyErr_Occurred())
11085 PyErr_BadInternalCall();
11086 goto error;
11087 }
11088
Benjamin Petersonbac79492012-01-14 13:34:47 -050011089 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011090 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011091 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011092 goto error;
11093
Victor Stinner488fa492011-12-12 00:01:39 +010011094 /* Shortcuts */
11095 if (left == unicode_empty) {
11096 Py_DECREF(left);
11097 Py_INCREF(right);
11098 *p_left = right;
11099 return;
11100 }
11101 if (right == unicode_empty)
11102 return;
11103
11104 left_len = PyUnicode_GET_LENGTH(left);
11105 right_len = PyUnicode_GET_LENGTH(right);
11106 if (left_len > PY_SSIZE_T_MAX - right_len) {
11107 PyErr_SetString(PyExc_OverflowError,
11108 "strings are too large to concat");
11109 goto error;
11110 }
11111 new_len = left_len + right_len;
11112
11113 if (unicode_modifiable(left)
11114 && PyUnicode_CheckExact(right)
11115 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011116 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11117 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011118 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011119 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011120 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11121 {
11122 /* append inplace */
11123 if (unicode_resize(p_left, new_len) != 0) {
11124 /* XXX if _PyUnicode_Resize() fails, 'left' has been
11125 * deallocated so it cannot be put back into
11126 * 'variable'. The MemoryError is raised when there
11127 * is no value in 'variable', which might (very
11128 * remotely) be a cause of incompatibilities.
11129 */
11130 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020011131 }
Victor Stinner488fa492011-12-12 00:01:39 +010011132 /* copy 'right' into the newly allocated area of 'left' */
11133 copy_characters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011134 }
Victor Stinner488fa492011-12-12 00:01:39 +010011135 else {
11136 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11137 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020011138 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011139
Victor Stinner488fa492011-12-12 00:01:39 +010011140 /* Concat the two Unicode strings */
11141 res = PyUnicode_New(new_len, maxchar);
11142 if (res == NULL)
11143 goto error;
11144 copy_characters(res, 0, left, 0, left_len);
11145 copy_characters(res, left_len, right, 0, right_len);
11146 Py_DECREF(left);
11147 *p_left = res;
11148 }
11149 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011150 return;
11151
11152error:
Victor Stinner488fa492011-12-12 00:01:39 +010011153 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011154}
11155
11156void
11157PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11158{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011159 PyUnicode_Append(pleft, right);
11160 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011161}
11162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011163PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011164 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011167string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011168interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
11170static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011171unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011173 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011174 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011175 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 int kind1, kind2, kind;
11178 void *buf1, *buf2;
11179 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
Jesus Ceaac451502011-04-20 17:09:23 +020011181 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11182 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 kind1 = PyUnicode_KIND(self);
11186 kind2 = PyUnicode_KIND(substring);
11187 kind = kind1 > kind2 ? kind1 : kind2;
11188 buf1 = PyUnicode_DATA(self);
11189 buf2 = PyUnicode_DATA(substring);
11190 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011191 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 if (!buf1) {
11193 Py_DECREF(substring);
11194 return NULL;
11195 }
11196 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011197 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011198 if (!buf2) {
11199 Py_DECREF(substring);
11200 if (kind1 != kind) PyMem_Free(buf1);
11201 return NULL;
11202 }
11203 len1 = PyUnicode_GET_LENGTH(self);
11204 len2 = PyUnicode_GET_LENGTH(substring);
11205
11206 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011207 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 case PyUnicode_1BYTE_KIND:
11209 iresult = ucs1lib_count(
11210 ((Py_UCS1*)buf1) + start, end - start,
11211 buf2, len2, PY_SSIZE_T_MAX
11212 );
11213 break;
11214 case PyUnicode_2BYTE_KIND:
11215 iresult = ucs2lib_count(
11216 ((Py_UCS2*)buf1) + start, end - start,
11217 buf2, len2, PY_SSIZE_T_MAX
11218 );
11219 break;
11220 case PyUnicode_4BYTE_KIND:
11221 iresult = ucs4lib_count(
11222 ((Py_UCS4*)buf1) + start, end - start,
11223 buf2, len2, PY_SSIZE_T_MAX
11224 );
11225 break;
11226 default:
11227 assert(0); iresult = 0;
11228 }
11229
11230 result = PyLong_FromSsize_t(iresult);
11231
11232 if (kind1 != kind)
11233 PyMem_Free(buf1);
11234 if (kind2 != kind)
11235 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
11237 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011238
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239 return result;
11240}
11241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011242PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011243 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011245Encode S using the codec registered for encoding. Default encoding\n\
11246is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011247handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011248a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11249'xmlcharrefreplace' as well as any other name registered with\n\
11250codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
11252static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011253unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011255 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256 char *encoding = NULL;
11257 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011258
Benjamin Peterson308d6372009-09-18 21:42:35 +000011259 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11260 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011262 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011263}
11264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011265PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267\n\
11268Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011269If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270
11271static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011272unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 Py_ssize_t i, j, line_pos, src_len, incr;
11275 Py_UCS4 ch;
11276 PyObject *u;
11277 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011280 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284
Antoine Pitrou22425222011-10-04 19:10:51 +020011285 if (PyUnicode_READY(self) == -1)
11286 return NULL;
11287
Thomas Wouters7e474022000-07-16 12:04:32 +000011288 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 src_len = PyUnicode_GET_LENGTH(self);
11290 i = j = line_pos = 0;
11291 kind = PyUnicode_KIND(self);
11292 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011293 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 for (; i < src_len; i++) {
11295 ch = PyUnicode_READ(kind, src_data, i);
11296 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011297 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011299 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 goto overflow;
11302 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011304 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011308 goto overflow;
11309 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 if (ch == '\n' || ch == '\r')
11312 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011314 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011315 if (!found)
11316 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011317
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011319 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320 if (!u)
11321 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011322 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
Antoine Pitroue71d5742011-10-04 15:55:09 +020011324 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
Antoine Pitroue71d5742011-10-04 15:55:09 +020011326 for (; i < src_len; i++) {
11327 ch = PyUnicode_READ(kind, src_data, i);
11328 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011330 incr = tabsize - (line_pos % tabsize);
11331 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011332 FILL(kind, dest_data, ' ', j, incr);
11333 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011334 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011335 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011336 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011337 line_pos++;
11338 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011339 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011340 if (ch == '\n' || ch == '\r')
11341 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011343 }
11344 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011345 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011346
Antoine Pitroue71d5742011-10-04 15:55:09 +020011347 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011348 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11349 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350}
11351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011352PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011353 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354\n\
11355Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011356such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357arguments start and end are interpreted as in slice notation.\n\
11358\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011359Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360
11361static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011362unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011364 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011365 Py_ssize_t start;
11366 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011367 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Jesus Ceaac451502011-04-20 17:09:23 +020011369 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11370 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 if (PyUnicode_READY(self) == -1)
11374 return NULL;
11375 if (PyUnicode_READY(substring) == -1)
11376 return NULL;
11377
Victor Stinner7931d9a2011-11-04 00:22:48 +010011378 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011379
11380 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (result == -2)
11383 return NULL;
11384
Christian Heimes217cfd12007-12-02 14:31:20 +000011385 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386}
11387
11388static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011389unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011391 void *data;
11392 enum PyUnicode_Kind kind;
11393 Py_UCS4 ch;
11394 PyObject *res;
11395
11396 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11397 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011399 }
11400 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11401 PyErr_SetString(PyExc_IndexError, "string index out of range");
11402 return NULL;
11403 }
11404 kind = PyUnicode_KIND(self);
11405 data = PyUnicode_DATA(self);
11406 ch = PyUnicode_READ(kind, data, index);
11407 if (ch < 256)
11408 return get_latin1_char(ch);
11409
11410 res = PyUnicode_New(1, ch);
11411 if (res == NULL)
11412 return NULL;
11413 kind = PyUnicode_KIND(res);
11414 data = PyUnicode_DATA(res);
11415 PyUnicode_WRITE(kind, data, 0, ch);
11416 assert(_PyUnicode_CheckConsistency(res, 1));
11417 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Guido van Rossumc2504932007-09-18 19:42:40 +000011420/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011421 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011422static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011423unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424{
Guido van Rossumc2504932007-09-18 19:42:40 +000011425 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011426 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011427
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011428#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011429 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011430#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 if (_PyUnicode_HASH(self) != -1)
11432 return _PyUnicode_HASH(self);
11433 if (PyUnicode_READY(self) == -1)
11434 return -1;
11435 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011436 /*
11437 We make the hash of the empty string be 0, rather than using
11438 (prefix ^ suffix), since this slightly obfuscates the hash secret
11439 */
11440 if (len == 0) {
11441 _PyUnicode_HASH(self) = 0;
11442 return 0;
11443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444
11445 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011446#define HASH(P) \
11447 x ^= (Py_uhash_t) *P << 7; \
11448 while (--len >= 0) \
11449 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450
Georg Brandl2fb477c2012-02-21 00:33:36 +010011451 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 switch (PyUnicode_KIND(self)) {
11453 case PyUnicode_1BYTE_KIND: {
11454 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11455 HASH(c);
11456 break;
11457 }
11458 case PyUnicode_2BYTE_KIND: {
11459 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11460 HASH(s);
11461 break;
11462 }
11463 default: {
11464 Py_UCS4 *l;
11465 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11466 "Impossible switch case in unicode_hash");
11467 l = PyUnicode_4BYTE_DATA(self);
11468 HASH(l);
11469 break;
11470 }
11471 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011472 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11473 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474
Guido van Rossumc2504932007-09-18 19:42:40 +000011475 if (x == -1)
11476 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011478 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011479}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011480#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011483 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011485Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486
11487static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011488unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011490 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011491 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011492 Py_ssize_t start;
11493 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
Jesus Ceaac451502011-04-20 17:09:23 +020011495 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11496 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (PyUnicode_READY(self) == -1)
11500 return NULL;
11501 if (PyUnicode_READY(substring) == -1)
11502 return NULL;
11503
Victor Stinner7931d9a2011-11-04 00:22:48 +010011504 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505
11506 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 if (result == -2)
11509 return NULL;
11510
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 if (result < 0) {
11512 PyErr_SetString(PyExc_ValueError, "substring not found");
11513 return NULL;
11514 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011515
Christian Heimes217cfd12007-12-02 14:31:20 +000011516 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517}
11518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011519PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011520 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011522Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011523at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011526unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 Py_ssize_t i, length;
11529 int kind;
11530 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 int cased;
11532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (PyUnicode_READY(self) == -1)
11534 return NULL;
11535 length = PyUnicode_GET_LENGTH(self);
11536 kind = PyUnicode_KIND(self);
11537 data = PyUnicode_DATA(self);
11538
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 if (length == 1)
11541 return PyBool_FromLong(
11542 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011544 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011547
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549 for (i = 0; i < length; i++) {
11550 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011551
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11553 return PyBool_FromLong(0);
11554 else if (!cased && Py_UNICODE_ISLOWER(ch))
11555 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011557 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558}
11559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011560PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011563Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011564at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565
11566static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011567unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 Py_ssize_t i, length;
11570 int kind;
11571 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 int cased;
11573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 if (PyUnicode_READY(self) == -1)
11575 return NULL;
11576 length = PyUnicode_GET_LENGTH(self);
11577 kind = PyUnicode_KIND(self);
11578 data = PyUnicode_DATA(self);
11579
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011581 if (length == 1)
11582 return PyBool_FromLong(
11583 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011585 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011588
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 for (i = 0; i < length; i++) {
11591 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011592
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11594 return PyBool_FromLong(0);
11595 else if (!cased && Py_UNICODE_ISUPPER(ch))
11596 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011597 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011598 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599}
11600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011601PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011602 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011604Return True if S is a titlecased string and there is at least one\n\
11605character in S, i.e. upper- and titlecase characters may only\n\
11606follow uncased characters and lowercase characters only cased ones.\n\
11607Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
11609static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011610unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 Py_ssize_t i, length;
11613 int kind;
11614 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615 int cased, previous_is_cased;
11616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 if (PyUnicode_READY(self) == -1)
11618 return NULL;
11619 length = PyUnicode_GET_LENGTH(self);
11620 kind = PyUnicode_KIND(self);
11621 data = PyUnicode_DATA(self);
11622
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (length == 1) {
11625 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11626 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11627 (Py_UNICODE_ISUPPER(ch) != 0));
11628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011630 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011632 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011633
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634 cased = 0;
11635 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 for (i = 0; i < length; i++) {
11637 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011638
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11640 if (previous_is_cased)
11641 return PyBool_FromLong(0);
11642 previous_is_cased = 1;
11643 cased = 1;
11644 }
11645 else if (Py_UNICODE_ISLOWER(ch)) {
11646 if (!previous_is_cased)
11647 return PyBool_FromLong(0);
11648 previous_is_cased = 1;
11649 cased = 1;
11650 }
11651 else
11652 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011654 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655}
11656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011657PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011660Return True if all characters in S are whitespace\n\
11661and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662
11663static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011664unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 Py_ssize_t i, length;
11667 int kind;
11668 void *data;
11669
11670 if (PyUnicode_READY(self) == -1)
11671 return NULL;
11672 length = PyUnicode_GET_LENGTH(self);
11673 kind = PyUnicode_KIND(self);
11674 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 if (length == 1)
11678 return PyBool_FromLong(
11679 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011681 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 for (i = 0; i < length; i++) {
11686 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011687 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011688 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011690 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691}
11692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011693PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011694 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011695\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011696Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011697and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011698
11699static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011700unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 Py_ssize_t i, length;
11703 int kind;
11704 void *data;
11705
11706 if (PyUnicode_READY(self) == -1)
11707 return NULL;
11708 length = PyUnicode_GET_LENGTH(self);
11709 kind = PyUnicode_KIND(self);
11710 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011711
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011712 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 if (length == 1)
11714 return PyBool_FromLong(
11715 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011716
11717 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 for (i = 0; i < length; i++) {
11722 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011724 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011725 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011726}
11727
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011728PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011729 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011730\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011731Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011732and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011733
11734static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011735unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011737 int kind;
11738 void *data;
11739 Py_ssize_t len, i;
11740
11741 if (PyUnicode_READY(self) == -1)
11742 return NULL;
11743
11744 kind = PyUnicode_KIND(self);
11745 data = PyUnicode_DATA(self);
11746 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011747
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011748 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (len == 1) {
11750 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11751 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11752 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011753
11754 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 for (i = 0; i < len; i++) {
11759 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011760 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011763 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011764}
11765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011769Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
11772static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011773unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 Py_ssize_t i, length;
11776 int kind;
11777 void *data;
11778
11779 if (PyUnicode_READY(self) == -1)
11780 return NULL;
11781 length = PyUnicode_GET_LENGTH(self);
11782 kind = PyUnicode_KIND(self);
11783 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 1)
11787 return PyBool_FromLong(
11788 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011790 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 for (i = 0; i < length; i++) {
11795 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011798 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011804Return True if all characters in S are digits\n\
11805and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806
11807static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 Py_ssize_t i, length;
11811 int kind;
11812 void *data;
11813
11814 if (PyUnicode_READY(self) == -1)
11815 return NULL;
11816 length = PyUnicode_GET_LENGTH(self);
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 if (length == 1) {
11822 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11823 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11824 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011826 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011828 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 for (i = 0; i < length; i++) {
11831 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011834 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835}
11836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011837PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011840Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011841False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842
11843static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011844unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 Py_ssize_t i, length;
11847 int kind;
11848 void *data;
11849
11850 if (PyUnicode_READY(self) == -1)
11851 return NULL;
11852 length = PyUnicode_GET_LENGTH(self);
11853 kind = PyUnicode_KIND(self);
11854 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (length == 1)
11858 return PyBool_FromLong(
11859 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011861 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 for (i = 0; i < length; i++) {
11866 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011869 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870}
11871
Martin v. Löwis47383402007-08-15 07:32:56 +000011872int
11873PyUnicode_IsIdentifier(PyObject *self)
11874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 int kind;
11876 void *data;
11877 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011878 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011880 if (PyUnicode_READY(self) == -1) {
11881 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011882 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 }
11884
11885 /* Special case for empty strings */
11886 if (PyUnicode_GET_LENGTH(self) == 0)
11887 return 0;
11888 kind = PyUnicode_KIND(self);
11889 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011890
11891 /* PEP 3131 says that the first character must be in
11892 XID_Start and subsequent characters in XID_Continue,
11893 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011894 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011895 letters, digits, underscore). However, given the current
11896 definition of XID_Start and XID_Continue, it is sufficient
11897 to check just for these, except that _ must be allowed
11898 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011900 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011901 return 0;
11902
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011903 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011906 return 1;
11907}
11908
11909PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011911\n\
11912Return True if S is a valid identifier according\n\
11913to the language definition.");
11914
11915static PyObject*
11916unicode_isidentifier(PyObject *self)
11917{
11918 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11919}
11920
Georg Brandl559e5d72008-06-11 18:37:52 +000011921PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011922 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011923\n\
11924Return True if all characters in S are considered\n\
11925printable in repr() or S is empty, False otherwise.");
11926
11927static PyObject*
11928unicode_isprintable(PyObject *self)
11929{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 Py_ssize_t i, length;
11931 int kind;
11932 void *data;
11933
11934 if (PyUnicode_READY(self) == -1)
11935 return NULL;
11936 length = PyUnicode_GET_LENGTH(self);
11937 kind = PyUnicode_KIND(self);
11938 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011939
11940 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011941 if (length == 1)
11942 return PyBool_FromLong(
11943 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 for (i = 0; i < length; i++) {
11946 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011947 Py_RETURN_FALSE;
11948 }
11949 }
11950 Py_RETURN_TRUE;
11951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011954 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
11956Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011957iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958
11959static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011960unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011962 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963}
11964
Martin v. Löwis18e16552006-02-15 17:27:45 +000011965static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011966unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011968 if (PyUnicode_READY(self) == -1)
11969 return -1;
11970 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971}
11972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011973PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011976Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011977done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
11979static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011980unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011982 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 Py_UCS4 fillchar = ' ';
11984
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011985 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986 return NULL;
11987
Benjamin Petersonbac79492012-01-14 13:34:47 -050011988 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990
Victor Stinnerc4b49542011-12-11 22:44:26 +010011991 if (PyUnicode_GET_LENGTH(self) >= width)
11992 return unicode_result_unchanged(self);
11993
11994 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995}
11996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011997PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012000Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
12002static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012003unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012005 if (PyUnicode_READY(self) == -1)
12006 return NULL;
12007 if (PyUnicode_IS_ASCII(self))
12008 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012009 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010}
12011
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format string for each strip mode, indexed by the
   constants above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i] + 3)
12020
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012021/* externally visible for str.strip(unicode) */
12022PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012023_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 void *data;
12026 int kind;
12027 Py_ssize_t i, j, len;
12028 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12031 return NULL;
12032
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_DATA(self);
12035 len = PyUnicode_GET_LENGTH(self);
12036 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12037 PyUnicode_DATA(sepobj),
12038 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000012039
Benjamin Peterson14339b62009-01-31 16:36:08 +000012040 i = 0;
12041 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 while (i < len &&
12043 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 i++;
12045 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012046 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012047
Benjamin Peterson14339b62009-01-31 16:36:08 +000012048 j = len;
12049 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 do {
12051 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 } while (j >= i &&
12053 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012055 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012056
Victor Stinner7931d9a2011-11-04 00:22:48 +010012057 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058}
12059
12060PyObject*
12061PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12062{
12063 unsigned char *data;
12064 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012065 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066
Victor Stinnerde636f32011-10-01 03:55:54 +020012067 if (PyUnicode_READY(self) == -1)
12068 return NULL;
12069
Victor Stinner684d5fd2012-05-03 02:32:34 +020012070 length = PyUnicode_GET_LENGTH(self);
12071 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012072
Victor Stinner684d5fd2012-05-03 02:32:34 +020012073 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012074 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075
Victor Stinnerde636f32011-10-01 03:55:54 +020012076 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012077 PyErr_SetString(PyExc_IndexError, "string index out of range");
12078 return NULL;
12079 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020012080 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020012081 Py_INCREF(unicode_empty);
12082 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020012083 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012084
Victor Stinner684d5fd2012-05-03 02:32:34 +020012085 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012086 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012087 data = PyUnicode_1BYTE_DATA(self);
12088 return unicode_fromascii(data + start, length);
12089 }
12090 else {
12091 kind = PyUnicode_KIND(self);
12092 data = PyUnicode_1BYTE_DATA(self);
12093 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012094 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012095 length);
12096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012097}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
12099static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012100do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 int kind;
12103 void *data;
12104 Py_ssize_t len, i, j;
12105
12106 if (PyUnicode_READY(self) == -1)
12107 return NULL;
12108
12109 kind = PyUnicode_KIND(self);
12110 data = PyUnicode_DATA(self);
12111 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012112
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 i = 0;
12114 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012116 i++;
12117 }
12118 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012119
Benjamin Peterson14339b62009-01-31 16:36:08 +000012120 j = len;
12121 if (striptype != LEFTSTRIP) {
12122 do {
12123 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 j++;
12126 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127
Victor Stinner7931d9a2011-11-04 00:22:48 +010012128 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129}
12130
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131
12132static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012133do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012135 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136
Benjamin Peterson14339b62009-01-31 16:36:08 +000012137 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
12138 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 if (sep != NULL && sep != Py_None) {
12141 if (PyUnicode_Check(sep))
12142 return _PyUnicode_XStrip(self, striptype, sep);
12143 else {
12144 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 "%s arg must be None or str",
12146 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 return NULL;
12148 }
12149 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012152}
12153
12154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012155PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012156 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012157\n\
12158Return a copy of the string S with leading and trailing\n\
12159whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012160If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012161
12162static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012163unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012165 if (PyTuple_GET_SIZE(args) == 0)
12166 return do_strip(self, BOTHSTRIP); /* Common case */
12167 else
12168 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012169}
12170
12171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012172PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012174\n\
12175Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012176If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012177
12178static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012179unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012180{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012181 if (PyTuple_GET_SIZE(args) == 0)
12182 return do_strip(self, LEFTSTRIP); /* Common case */
12183 else
12184 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012185}
12186
12187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012188PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012189 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012190\n\
12191Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012192If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012193
12194static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012195unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012196{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012197 if (PyTuple_GET_SIZE(args) == 0)
12198 return do_strip(self, RIGHTSTRIP); /* Common case */
12199 else
12200 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012201}
12202
12203
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012205unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012207 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
Georg Brandl222de0f2009-04-12 12:01:50 +000012210 if (len < 1) {
12211 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020012212 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000012213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214
Victor Stinnerc4b49542011-12-11 22:44:26 +010012215 /* no repeat, return original string */
12216 if (len == 1)
12217 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012218
Benjamin Petersonbac79492012-01-14 13:34:47 -050012219 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 return NULL;
12221
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012222 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012223 PyErr_SetString(PyExc_OverflowError,
12224 "repeated string is too long");
12225 return NULL;
12226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012227 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012228
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012229 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 if (!u)
12231 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012232 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 if (PyUnicode_GET_LENGTH(str) == 1) {
12235 const int kind = PyUnicode_KIND(str);
12236 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012237 if (kind == PyUnicode_1BYTE_KIND) {
12238 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012239 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012240 }
12241 else if (kind == PyUnicode_2BYTE_KIND) {
12242 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012243 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012244 ucs2[n] = fill_char;
12245 } else {
12246 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12247 assert(kind == PyUnicode_4BYTE_KIND);
12248 for (n = 0; n < len; ++n)
12249 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 }
12252 else {
12253 /* number of characters copied this far */
12254 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012255 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 char *to = (char *) PyUnicode_DATA(u);
12257 Py_MEMCPY(to, PyUnicode_DATA(str),
12258 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 n = (done <= nchars-done) ? done : nchars-done;
12261 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 }
12265
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012266 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012267 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268}
12269
Alexander Belopolsky40018472011-02-26 01:02:56 +000012270PyObject *
12271PyUnicode_Replace(PyObject *obj,
12272 PyObject *subobj,
12273 PyObject *replobj,
12274 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275{
12276 PyObject *self;
12277 PyObject *str1;
12278 PyObject *str2;
12279 PyObject *result;
12280
12281 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012282 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012285 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 Py_DECREF(self);
12287 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 }
12289 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012290 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012291 Py_DECREF(self);
12292 Py_DECREF(str1);
12293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012295 if (PyUnicode_READY(self) == -1 ||
12296 PyUnicode_READY(str1) == -1 ||
12297 PyUnicode_READY(str2) == -1)
12298 result = NULL;
12299 else
12300 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301 Py_DECREF(self);
12302 Py_DECREF(str1);
12303 Py_DECREF(str2);
12304 return result;
12305}
12306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012307PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012308 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309\n\
12310Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012311old replaced by new. If the optional argument count is\n\
12312given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313
12314static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 PyObject *str1;
12318 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012319 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320 PyObject *result;
12321
Martin v. Löwis18e16552006-02-15 17:27:45 +000012322 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012324 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012327 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 return NULL;
12329 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012330 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012331 Py_DECREF(str1);
12332 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012333 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012334 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12335 result = NULL;
12336 else
12337 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338
12339 Py_DECREF(str1);
12340 Py_DECREF(str2);
12341 return result;
12342}
12343
Alexander Belopolsky40018472011-02-26 01:02:56 +000012344static PyObject *
12345unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012347 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 Py_ssize_t isize;
12349 Py_ssize_t osize, squote, dquote, i, o;
12350 Py_UCS4 max, quote;
12351 int ikind, okind;
12352 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012355 return NULL;
12356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 isize = PyUnicode_GET_LENGTH(unicode);
12358 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 /* Compute length of output, quote characters, and
12361 maximum character */
12362 osize = 2; /* quotes */
12363 max = 127;
12364 squote = dquote = 0;
12365 ikind = PyUnicode_KIND(unicode);
12366 for (i = 0; i < isize; i++) {
12367 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12368 switch (ch) {
12369 case '\'': squote++; osize++; break;
12370 case '"': dquote++; osize++; break;
12371 case '\\': case '\t': case '\r': case '\n':
12372 osize += 2; break;
12373 default:
12374 /* Fast-path ASCII */
12375 if (ch < ' ' || ch == 0x7f)
12376 osize += 4; /* \xHH */
12377 else if (ch < 0x7f)
12378 osize++;
12379 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12380 osize++;
12381 max = ch > max ? ch : max;
12382 }
12383 else if (ch < 0x100)
12384 osize += 4; /* \xHH */
12385 else if (ch < 0x10000)
12386 osize += 6; /* \uHHHH */
12387 else
12388 osize += 10; /* \uHHHHHHHH */
12389 }
12390 }
12391
12392 quote = '\'';
12393 if (squote) {
12394 if (dquote)
12395 /* Both squote and dquote present. Use squote,
12396 and escape them */
12397 osize += squote;
12398 else
12399 quote = '"';
12400 }
12401
12402 repr = PyUnicode_New(osize, max);
12403 if (repr == NULL)
12404 return NULL;
12405 okind = PyUnicode_KIND(repr);
12406 odata = PyUnicode_DATA(repr);
12407
12408 PyUnicode_WRITE(okind, odata, 0, quote);
12409 PyUnicode_WRITE(okind, odata, osize-1, quote);
12410
12411 for (i = 0, o = 1; i < isize; i++) {
12412 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012413
12414 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 if ((ch == quote) || (ch == '\\')) {
12416 PyUnicode_WRITE(okind, odata, o++, '\\');
12417 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012418 continue;
12419 }
12420
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012422 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 PyUnicode_WRITE(okind, odata, o++, '\\');
12424 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012425 }
12426 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 PyUnicode_WRITE(okind, odata, o++, '\\');
12428 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012429 }
12430 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 PyUnicode_WRITE(okind, odata, o++, '\\');
12432 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012433 }
12434
12435 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012436 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 PyUnicode_WRITE(okind, odata, o++, '\\');
12438 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012439 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12440 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012441 }
12442
Georg Brandl559e5d72008-06-11 18:37:52 +000012443 /* Copy ASCII characters as-is */
12444 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012446 }
12447
Benjamin Peterson29060642009-01-31 22:14:21 +000012448 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012449 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012451 (categories Z* and C* except ASCII space)
12452 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012454 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012455 if (ch <= 0xff) {
12456 PyUnicode_WRITE(okind, odata, o++, '\\');
12457 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012458 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012460 }
12461 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 else if (ch >= 0x10000) {
12463 PyUnicode_WRITE(okind, odata, o++, '\\');
12464 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12467 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12468 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12469 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12470 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12471 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12472 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012473 }
12474 /* Map 16-bit characters to '\uxxxx' */
12475 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 PyUnicode_WRITE(okind, odata, o++, '\\');
12477 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012478 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12479 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12480 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12481 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012482 }
12483 }
12484 /* Copy characters as-is */
12485 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012487 }
12488 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012491 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012492 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493}
12494
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012495PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012496 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497\n\
12498Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012499such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500arguments start and end are interpreted as in slice notation.\n\
12501\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012502Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503
12504static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012506{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012507 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012508 Py_ssize_t start;
12509 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012510 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
Jesus Ceaac451502011-04-20 17:09:23 +020012512 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12513 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012514 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 if (PyUnicode_READY(self) == -1)
12517 return NULL;
12518 if (PyUnicode_READY(substring) == -1)
12519 return NULL;
12520
Victor Stinner7931d9a2011-11-04 00:22:48 +010012521 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522
12523 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012524
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012525 if (result == -2)
12526 return NULL;
12527
Christian Heimes217cfd12007-12-02 14:31:20 +000012528 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529}
12530
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012531PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012532 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012534Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
12536static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012539 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012540 Py_ssize_t start;
12541 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012542 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Jesus Ceaac451502011-04-20 17:09:23 +020012544 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12545 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (PyUnicode_READY(self) == -1)
12549 return NULL;
12550 if (PyUnicode_READY(substring) == -1)
12551 return NULL;
12552
Victor Stinner7931d9a2011-11-04 00:22:48 +010012553 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554
12555 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557 if (result == -2)
12558 return NULL;
12559
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560 if (result < 0) {
12561 PyErr_SetString(PyExc_ValueError, "substring not found");
12562 return NULL;
12563 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012564
Christian Heimes217cfd12007-12-02 14:31:20 +000012565 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566}
12567
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012568PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012571Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012572done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
12574static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012575unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012577 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 Py_UCS4 fillchar = ' ';
12579
Victor Stinnere9a29352011-10-01 02:14:59 +020012580 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012582
Benjamin Petersonbac79492012-01-14 13:34:47 -050012583 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 return NULL;
12585
Victor Stinnerc4b49542011-12-11 22:44:26 +010012586 if (PyUnicode_GET_LENGTH(self) >= width)
12587 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588
Victor Stinnerc4b49542011-12-11 22:44:26 +010012589 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590}
12591
Alexander Belopolsky40018472011-02-26 01:02:56 +000012592PyObject *
12593PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594{
12595 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012596
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597 s = PyUnicode_FromObject(s);
12598 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012599 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 if (sep != NULL) {
12601 sep = PyUnicode_FromObject(sep);
12602 if (sep == NULL) {
12603 Py_DECREF(s);
12604 return NULL;
12605 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012606 }
12607
Victor Stinner9310abb2011-10-05 00:59:23 +020012608 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609
12610 Py_DECREF(s);
12611 Py_XDECREF(sep);
12612 return result;
12613}
12614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012615PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012616 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617\n\
12618Return a list of the words in S, using sep as the\n\
12619delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012620splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012621whitespace string is a separator and empty strings are\n\
12622removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
12624static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012625unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012627 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012629 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012631 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12632 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633 return NULL;
12634
12635 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012638 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012640 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012641}
12642
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643PyObject *
12644PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12645{
12646 PyObject* str_obj;
12647 PyObject* sep_obj;
12648 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 int kind1, kind2, kind;
12650 void *buf1 = NULL, *buf2 = NULL;
12651 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012652
12653 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012654 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012655 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012656 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012657 if (!sep_obj) {
12658 Py_DECREF(str_obj);
12659 return NULL;
12660 }
12661 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12662 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012663 Py_DECREF(str_obj);
12664 return NULL;
12665 }
12666
Victor Stinner14f8f022011-10-05 20:58:25 +020012667 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012669 kind = Py_MAX(kind1, kind2);
12670 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012672 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 if (!buf1)
12674 goto onError;
12675 buf2 = PyUnicode_DATA(sep_obj);
12676 if (kind2 != kind)
12677 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12678 if (!buf2)
12679 goto onError;
12680 len1 = PyUnicode_GET_LENGTH(str_obj);
12681 len2 = PyUnicode_GET_LENGTH(sep_obj);
12682
Benjamin Petersonead6b532011-12-20 17:23:42 -060012683 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012685 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12686 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12687 else
12688 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012689 break;
12690 case PyUnicode_2BYTE_KIND:
12691 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12692 break;
12693 case PyUnicode_4BYTE_KIND:
12694 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12695 break;
12696 default:
12697 assert(0);
12698 out = 0;
12699 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012700
12701 Py_DECREF(sep_obj);
12702 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 if (kind1 != kind)
12704 PyMem_Free(buf1);
12705 if (kind2 != kind)
12706 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012707
12708 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 onError:
12710 Py_DECREF(sep_obj);
12711 Py_DECREF(str_obj);
12712 if (kind1 != kind && buf1)
12713 PyMem_Free(buf1);
12714 if (kind2 != kind && buf2)
12715 PyMem_Free(buf2);
12716 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012717}
12718
12719
12720PyObject *
12721PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12722{
12723 PyObject* str_obj;
12724 PyObject* sep_obj;
12725 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 int kind1, kind2, kind;
12727 void *buf1 = NULL, *buf2 = NULL;
12728 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012729
12730 str_obj = PyUnicode_FromObject(str_in);
12731 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012733 sep_obj = PyUnicode_FromObject(sep_in);
12734 if (!sep_obj) {
12735 Py_DECREF(str_obj);
12736 return NULL;
12737 }
12738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 kind1 = PyUnicode_KIND(str_in);
12740 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012741 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 buf1 = PyUnicode_DATA(str_in);
12743 if (kind1 != kind)
12744 buf1 = _PyUnicode_AsKind(str_in, kind);
12745 if (!buf1)
12746 goto onError;
12747 buf2 = PyUnicode_DATA(sep_obj);
12748 if (kind2 != kind)
12749 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12750 if (!buf2)
12751 goto onError;
12752 len1 = PyUnicode_GET_LENGTH(str_obj);
12753 len2 = PyUnicode_GET_LENGTH(sep_obj);
12754
Benjamin Petersonead6b532011-12-20 17:23:42 -060012755 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012757 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12758 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12759 else
12760 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 break;
12762 case PyUnicode_2BYTE_KIND:
12763 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12764 break;
12765 case PyUnicode_4BYTE_KIND:
12766 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12767 break;
12768 default:
12769 assert(0);
12770 out = 0;
12771 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012772
12773 Py_DECREF(sep_obj);
12774 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 if (kind1 != kind)
12776 PyMem_Free(buf1);
12777 if (kind2 != kind)
12778 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012779
12780 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 onError:
12782 Py_DECREF(sep_obj);
12783 Py_DECREF(str_obj);
12784 if (kind1 != kind && buf1)
12785 PyMem_Free(buf1);
12786 if (kind2 != kind && buf2)
12787 PyMem_Free(buf2);
12788 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789}
12790
12791PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012794Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012796found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797
12798static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012799unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800{
Victor Stinner9310abb2011-10-05 00:59:23 +020012801 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802}
12803
12804PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012805 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012807Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012809separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810
12811static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012812unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813{
Victor Stinner9310abb2011-10-05 00:59:23 +020012814 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815}
12816
Alexander Belopolsky40018472011-02-26 01:02:56 +000012817PyObject *
12818PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012819{
12820 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012821
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012822 s = PyUnicode_FromObject(s);
12823 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012824 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 if (sep != NULL) {
12826 sep = PyUnicode_FromObject(sep);
12827 if (sep == NULL) {
12828 Py_DECREF(s);
12829 return NULL;
12830 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012831 }
12832
Victor Stinner9310abb2011-10-05 00:59:23 +020012833 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012834
12835 Py_DECREF(s);
12836 Py_XDECREF(sep);
12837 return result;
12838}
12839
12840PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012841 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012842\n\
12843Return a list of the words in S, using sep as the\n\
12844delimiter string, starting at the end of the string and\n\
12845working to the front. If maxsplit is given, at most maxsplit\n\
12846splits are done. If sep is not specified, any whitespace string\n\
12847is a separator.");
12848
12849static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012850unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012851{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012852 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012853 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012854 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012856 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12857 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012858 return NULL;
12859
12860 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012861 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012862 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012863 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012864 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012865 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012866}
12867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012868PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870\n\
12871Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012872Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012873is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
12875static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012876unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012878 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012879 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012881 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12882 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883 return NULL;
12884
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012885 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886}
12887
12888static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012889PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012891 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892}
12893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012894PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012895 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896\n\
12897Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012898and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899
12900static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012901unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012903 if (PyUnicode_READY(self) == -1)
12904 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012905 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906}
12907
Georg Brandlceee0772007-11-27 23:48:05 +000012908PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012909 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012910\n\
12911Return a translation table usable for str.translate().\n\
12912If there is only one argument, it must be a dictionary mapping Unicode\n\
12913ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012914Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012915If there are two arguments, they must be strings of equal length, and\n\
12916in the resulting dictionary, each character in x will be mapped to the\n\
12917character at the same position in y. If there is a third argument, it\n\
12918must be a string, whose characters will be mapped to None in the result.");
12919
12920static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012921unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012922{
12923 PyObject *x, *y = NULL, *z = NULL;
12924 PyObject *new = NULL, *key, *value;
12925 Py_ssize_t i = 0;
12926 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012927
Georg Brandlceee0772007-11-27 23:48:05 +000012928 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12929 return NULL;
12930 new = PyDict_New();
12931 if (!new)
12932 return NULL;
12933 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 int x_kind, y_kind, z_kind;
12935 void *x_data, *y_data, *z_data;
12936
Georg Brandlceee0772007-11-27 23:48:05 +000012937 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012938 if (!PyUnicode_Check(x)) {
12939 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12940 "be a string if there is a second argument");
12941 goto err;
12942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012944 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12945 "arguments must have equal length");
12946 goto err;
12947 }
12948 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012949 x_kind = PyUnicode_KIND(x);
12950 y_kind = PyUnicode_KIND(y);
12951 x_data = PyUnicode_DATA(x);
12952 y_data = PyUnicode_DATA(y);
12953 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12954 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012955 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012956 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012957 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012958 if (!value) {
12959 Py_DECREF(key);
12960 goto err;
12961 }
Georg Brandlceee0772007-11-27 23:48:05 +000012962 res = PyDict_SetItem(new, key, value);
12963 Py_DECREF(key);
12964 Py_DECREF(value);
12965 if (res < 0)
12966 goto err;
12967 }
12968 /* create entries for deleting chars in z */
12969 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 z_kind = PyUnicode_KIND(z);
12971 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012972 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012974 if (!key)
12975 goto err;
12976 res = PyDict_SetItem(new, key, Py_None);
12977 Py_DECREF(key);
12978 if (res < 0)
12979 goto err;
12980 }
12981 }
12982 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 int kind;
12984 void *data;
12985
Georg Brandlceee0772007-11-27 23:48:05 +000012986 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012987 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012988 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12989 "to maketrans it must be a dict");
12990 goto err;
12991 }
12992 /* copy entries into the new dict, converting string keys to int keys */
12993 while (PyDict_Next(x, &i, &key, &value)) {
12994 if (PyUnicode_Check(key)) {
12995 /* convert string keys to integer keys */
12996 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012997 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012998 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12999 "table must be of length 1");
13000 goto err;
13001 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 kind = PyUnicode_KIND(key);
13003 data = PyUnicode_DATA(key);
13004 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013005 if (!newkey)
13006 goto err;
13007 res = PyDict_SetItem(new, newkey, value);
13008 Py_DECREF(newkey);
13009 if (res < 0)
13010 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013011 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013012 /* just keep integer keys */
13013 if (PyDict_SetItem(new, key, value) < 0)
13014 goto err;
13015 } else {
13016 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13017 "be strings or integers");
13018 goto err;
13019 }
13020 }
13021 }
13022 return new;
13023 err:
13024 Py_DECREF(new);
13025 return NULL;
13026}
13027
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013028PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013029 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013030\n\
13031Return a copy of the string S, where all characters have been mapped\n\
13032through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013033Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013034Unmapped characters are left untouched. Characters mapped to None\n\
13035are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013036
13037static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013038unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041}
13042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013043PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013044 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013046Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047
13048static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013049unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013051 if (PyUnicode_READY(self) == -1)
13052 return NULL;
13053 if (PyUnicode_IS_ASCII(self))
13054 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013055 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056}
13057
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013058PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013059 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013061Pad a numeric string S with zeros on the left, to fill a field\n\
13062of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013063
13064static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013065unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013067 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013068 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013069 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 int kind;
13071 void *data;
13072 Py_UCS4 chr;
13073
Martin v. Löwis18e16552006-02-15 17:27:45 +000013074 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075 return NULL;
13076
Benjamin Petersonbac79492012-01-14 13:34:47 -050013077 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013079
Victor Stinnerc4b49542011-12-11 22:44:26 +010013080 if (PyUnicode_GET_LENGTH(self) >= width)
13081 return unicode_result_unchanged(self);
13082
13083 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013084
13085 u = pad(self, fill, 0, '0');
13086
Walter Dörwald068325e2002-04-15 13:36:47 +000013087 if (u == NULL)
13088 return NULL;
13089
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013090 kind = PyUnicode_KIND(u);
13091 data = PyUnicode_DATA(u);
13092 chr = PyUnicode_READ(kind, data, fill);
13093
13094 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 PyUnicode_WRITE(kind, data, 0, chr);
13097 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098 }
13099
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013100 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013101 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103
#if 0
/* Compiled out.  Thin wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13111
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013112PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013115Return True if S starts with the specified prefix, False otherwise.\n\
13116With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013117With optional end, stop comparing S at that position.\n\
13118prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119
13120static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013121unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013124 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013125 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013126 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013127 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013128 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
Jesus Ceaac451502011-04-20 17:09:23 +020013130 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013132 if (PyTuple_Check(subobj)) {
13133 Py_ssize_t i;
13134 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013135 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013136 if (substring == NULL)
13137 return NULL;
13138 result = tailmatch(self, substring, start, end, -1);
13139 Py_DECREF(substring);
13140 if (result) {
13141 Py_RETURN_TRUE;
13142 }
13143 }
13144 /* nothing matched */
13145 Py_RETURN_FALSE;
13146 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013147 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013148 if (substring == NULL) {
13149 if (PyErr_ExceptionMatches(PyExc_TypeError))
13150 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13151 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013153 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013154 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013156 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157}
13158
13159
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013160PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013162\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013163Return True if S ends with the specified suffix, False otherwise.\n\
13164With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013165With optional end, stop comparing S at that position.\n\
13166suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167
13168static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013169unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013170 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013172 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013173 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013174 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013175 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013176 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177
Jesus Ceaac451502011-04-20 17:09:23 +020013178 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013180 if (PyTuple_Check(subobj)) {
13181 Py_ssize_t i;
13182 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013183 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013185 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013187 result = tailmatch(self, substring, start, end, +1);
13188 Py_DECREF(substring);
13189 if (result) {
13190 Py_RETURN_TRUE;
13191 }
13192 }
13193 Py_RETURN_FALSE;
13194 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013195 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013196 if (substring == NULL) {
13197 if (PyErr_ExceptionMatches(PyExc_TypeError))
13198 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13199 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013201 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013204 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205}
13206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013208
13209PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013211\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013212Return a formatted version of S, using substitutions from args and kwargs.\n\
13213The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013214
Eric Smith27bbca62010-11-04 17:06:58 +000013215PyDoc_STRVAR(format_map__doc__,
13216 "S.format_map(mapping) -> str\n\
13217\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013218Return a formatted version of S, using substitutions from mapping.\n\
13219The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013220
Eric Smith4a7d76d2008-05-30 18:10:19 +000013221static PyObject *
13222unicode__format__(PyObject* self, PyObject* args)
13223{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013224 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013225
13226 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13227 return NULL;
13228
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013229 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013231 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013232}
13233
Eric Smith8c663262007-08-25 02:26:07 +000013234PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013236\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013237Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013238
13239static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013240unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 Py_ssize_t size;
13243
13244 /* If it's a compact object, account for base structure +
13245 character data. */
13246 if (PyUnicode_IS_COMPACT_ASCII(v))
13247 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13248 else if (PyUnicode_IS_COMPACT(v))
13249 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013250 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013251 else {
13252 /* If it is a two-block object, account for base object, and
13253 for character block if present. */
13254 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013255 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013257 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 }
13259 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013260 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013261 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013262 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013263 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013264 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013265
13266 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013267}
13268
13269PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013270 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013271
13272static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013273unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013274{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013275 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276 if (!copy)
13277 return NULL;
13278 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013279}
13280
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013282 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013283 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013284 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13285 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013286 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13287 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013288 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013289 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13290 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13291 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13292 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13293 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013294 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013295 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13296 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13297 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013298 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013299 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13300 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13301 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013302 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013303 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013304 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013305 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013306 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13307 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13308 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13309 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13310 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13311 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13312 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13313 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13314 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13315 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13316 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13317 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13318 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13319 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013320 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013321 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013322 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013323 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013324 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013325 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013326 {"maketrans", (PyCFunction) unicode_maketrans,
13327 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013328 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013329#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013330 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013331 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332#endif
13333
Benjamin Peterson14339b62009-01-31 16:36:08 +000013334 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335 {NULL, NULL}
13336};
13337
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013338static PyObject *
13339unicode_mod(PyObject *v, PyObject *w)
13340{
Brian Curtindfc80e32011-08-10 20:28:54 -050013341 if (!PyUnicode_Check(v))
13342 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013343 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013344}
13345
13346static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013347 0, /*nb_add*/
13348 0, /*nb_subtract*/
13349 0, /*nb_multiply*/
13350 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013351};
13352
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013354 (lenfunc) unicode_length, /* sq_length */
13355 PyUnicode_Concat, /* sq_concat */
13356 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13357 (ssizeargfunc) unicode_getitem, /* sq_item */
13358 0, /* sq_slice */
13359 0, /* sq_ass_item */
13360 0, /* sq_ass_slice */
13361 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362};
13363
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013364static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013365unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367 if (PyUnicode_READY(self) == -1)
13368 return NULL;
13369
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013370 if (PyIndex_Check(item)) {
13371 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013372 if (i == -1 && PyErr_Occurred())
13373 return NULL;
13374 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013375 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013376 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013377 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013378 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013379 PyObject *result;
13380 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013381 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013382 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013383
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013384 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013386 return NULL;
13387 }
13388
13389 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013390 Py_INCREF(unicode_empty);
13391 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013392 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013393 slicelength == PyUnicode_GET_LENGTH(self)) {
13394 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013395 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013396 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013397 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013398 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013399 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013400 src_kind = PyUnicode_KIND(self);
13401 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013402 if (!PyUnicode_IS_ASCII(self)) {
13403 kind_limit = kind_maxchar_limit(src_kind);
13404 max_char = 0;
13405 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13406 ch = PyUnicode_READ(src_kind, src_data, cur);
13407 if (ch > max_char) {
13408 max_char = ch;
13409 if (max_char >= kind_limit)
13410 break;
13411 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013412 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013413 }
Victor Stinner55c99112011-10-13 01:17:06 +020013414 else
13415 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013416 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013417 if (result == NULL)
13418 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013419 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013420 dest_data = PyUnicode_DATA(result);
13421
13422 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013423 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13424 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013425 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013426 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013427 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013428 } else {
13429 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13430 return NULL;
13431 }
13432}
13433
13434static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013435 (lenfunc)unicode_length, /* mp_length */
13436 (binaryfunc)unicode_subscript, /* mp_subscript */
13437 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013438};
13439
Guido van Rossumd57fd912000-03-10 22:53:23 +000013440
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441/* Helpers for PyUnicode_Format() */
13442
13443static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013444getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013446 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 (*p_argidx)++;
13449 if (arglen < 0)
13450 return args;
13451 else
13452 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013453 }
13454 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456 return NULL;
13457}
13458
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013459/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013461static PyObject *
13462formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013463{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013464 char *p;
13465 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013467
Guido van Rossumd57fd912000-03-10 22:53:23 +000013468 x = PyFloat_AsDouble(v);
13469 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013470 return NULL;
13471
Guido van Rossumd57fd912000-03-10 22:53:23 +000013472 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013473 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013474
Eric Smith0923d1d2009-04-16 20:16:10 +000013475 p = PyOS_double_to_string(x, type, prec,
13476 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013477 if (p == NULL)
13478 return NULL;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013479 result = unicode_fromascii((unsigned char*)p, strlen(p));
Eric Smith0923d1d2009-04-16 20:16:10 +000013480 PyMem_Free(p);
13481 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482}
13483
Victor Stinnerd0880d52012-04-27 23:40:13 +020013484/* formatlong() emulates the format codes d, u, o, x and X, and
13485 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13486 * Python's regular ints.
13487 * Return value: a new PyUnicodeObject*, or NULL if error.
13488 * The output string is of the form
13489 * "-"? ("0x" | "0X")? digit+
13490 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13491 * set in flags. The case of hex digits will be correct,
13492 * There will be at least prec digits, zero-filled on the left if
13493 * necessary to get that many.
13494 * val object to be converted
13495 * flags bitmask of format flags; only F_ALT is looked at
13496 * prec minimum number of digits; 0-fill on left if needed
13497 * type a character in [duoxX]; u acts the same as d
13498 *
13499 * CAUTION: o, x and X conversions on regular ints can never
13500 * produce a '-' sign, but can for Python's unbounded ints.
13501 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013502static PyObject*
13503formatlong(PyObject *val, int flags, int prec, int type)
13504{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013505 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013506 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013507 Py_ssize_t i;
13508 int sign; /* 1 if '-', else 0 */
13509 int len; /* number of characters */
13510 Py_ssize_t llen;
13511 int numdigits; /* len == numnondigits + numdigits */
13512 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013513
Victor Stinnerd0880d52012-04-27 23:40:13 +020013514 /* Avoid exceeding SSIZE_T_MAX */
13515 if (prec > INT_MAX-3) {
13516 PyErr_SetString(PyExc_OverflowError,
13517 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013519 }
13520
13521 assert(PyLong_Check(val));
13522
13523 switch (type) {
13524 case 'd':
13525 case 'u':
13526 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013527 if (PyBool_Check(val))
13528 result = PyNumber_ToBase(val, 10);
13529 else
13530 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013531 break;
13532 case 'o':
13533 numnondigits = 2;
13534 result = PyNumber_ToBase(val, 8);
13535 break;
13536 case 'x':
13537 case 'X':
13538 numnondigits = 2;
13539 result = PyNumber_ToBase(val, 16);
13540 break;
13541 default:
13542 assert(!"'type' not in [duoxX]");
13543 }
13544 if (!result)
13545 return NULL;
13546
13547 assert(unicode_modifiable(result));
13548 assert(PyUnicode_IS_READY(result));
13549 assert(PyUnicode_IS_ASCII(result));
13550
13551 /* To modify the string in-place, there can only be one reference. */
13552 if (Py_REFCNT(result) != 1) {
13553 PyErr_BadInternalCall();
13554 return NULL;
13555 }
13556 buf = PyUnicode_DATA(result);
13557 llen = PyUnicode_GET_LENGTH(result);
13558 if (llen > INT_MAX) {
13559 PyErr_SetString(PyExc_ValueError,
13560 "string too large in _PyBytes_FormatLong");
13561 return NULL;
13562 }
13563 len = (int)llen;
13564 sign = buf[0] == '-';
13565 numnondigits += sign;
13566 numdigits = len - numnondigits;
13567 assert(numdigits > 0);
13568
13569 /* Get rid of base marker unless F_ALT */
13570 if (((flags & F_ALT) == 0 &&
13571 (type == 'o' || type == 'x' || type == 'X'))) {
13572 assert(buf[sign] == '0');
13573 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13574 buf[sign+1] == 'o');
13575 numnondigits -= 2;
13576 buf += 2;
13577 len -= 2;
13578 if (sign)
13579 buf[0] = '-';
13580 assert(len == numnondigits + numdigits);
13581 assert(numdigits > 0);
13582 }
13583
13584 /* Fill with leading zeroes to meet minimum width. */
13585 if (prec > numdigits) {
13586 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13587 numnondigits + prec);
13588 char *b1;
13589 if (!r1) {
13590 Py_DECREF(result);
13591 return NULL;
13592 }
13593 b1 = PyBytes_AS_STRING(r1);
13594 for (i = 0; i < numnondigits; ++i)
13595 *b1++ = *buf++;
13596 for (i = 0; i < prec - numdigits; i++)
13597 *b1++ = '0';
13598 for (i = 0; i < numdigits; i++)
13599 *b1++ = *buf++;
13600 *b1 = '\0';
13601 Py_DECREF(result);
13602 result = r1;
13603 buf = PyBytes_AS_STRING(result);
13604 len = numnondigits + prec;
13605 }
13606
13607 /* Fix up case for hex conversions. */
13608 if (type == 'X') {
13609 /* Need to convert all lower case letters to upper case.
13610 and need to convert 0x to 0X (and -0x to -0X). */
13611 for (i = 0; i < len; i++)
13612 if (buf[i] >= 'a' && buf[i] <= 'x')
13613 buf[i] -= 'a'-'A';
13614 }
13615 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13616 PyObject *unicode;
13617 unicode = unicode_fromascii((unsigned char *)buf, len);
13618 Py_DECREF(result);
13619 result = unicode;
13620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013621 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013622}
13623
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013624static Py_UCS4
13625formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013626{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013627 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013628 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013630 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013631 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013632 goto onError;
13633 }
13634 else {
13635 /* Integer input truncated to a character */
13636 long x;
13637 x = PyLong_AsLong(v);
13638 if (x == -1 && PyErr_Occurred())
13639 goto onError;
13640
Victor Stinner8faf8212011-12-08 22:14:11 +010013641 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 PyErr_SetString(PyExc_OverflowError,
13643 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013644 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 }
13646
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013647 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013648 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013649
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013651 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013653 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013654}
13655
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013656static int
13657repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13658{
13659 int r;
13660 assert(count > 0);
13661 assert(PyUnicode_Check(obj));
13662 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013663 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013664 if (repeated == NULL)
13665 return -1;
13666 r = _PyAccu_Accumulate(acc, repeated);
13667 Py_DECREF(repeated);
13668 return r;
13669 }
13670 else {
13671 do {
13672 if (_PyAccu_Accumulate(acc, obj))
13673 return -1;
13674 } while (--count);
13675 return 0;
13676 }
13677}
13678
Alexander Belopolsky40018472011-02-26 01:02:56 +000013679PyObject *
13680PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013682 void *fmt;
13683 int fmtkind;
13684 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013685 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013686 int r;
13687 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 PyObject *temp = NULL;
13691 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013692 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013693 _PyAccu acc;
13694 static PyObject *plus, *minus, *blank, *zero, *percent;
13695
13696 if (!plus && !(plus = get_latin1_char('+')))
13697 return NULL;
13698 if (!minus && !(minus = get_latin1_char('-')))
13699 return NULL;
13700 if (!blank && !(blank = get_latin1_char(' ')))
13701 return NULL;
13702 if (!zero && !(zero = get_latin1_char('0')))
13703 return NULL;
13704 if (!percent && !(percent = get_latin1_char('%')))
13705 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013706
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 PyErr_BadInternalCall();
13709 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013711 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013712 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013714 if (PyUnicode_READY(uformat) == -1)
13715 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013716 if (_PyAccu_Init(&acc))
13717 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013718 fmt = PyUnicode_DATA(uformat);
13719 fmtkind = PyUnicode_KIND(uformat);
13720 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13721 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013722
Guido van Rossumd57fd912000-03-10 22:53:23 +000013723 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 arglen = PyTuple_Size(args);
13725 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013726 }
13727 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 arglen = -1;
13729 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013730 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013731 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013732 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013734
13735 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013736 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013737 PyObject *nonfmt;
13738 Py_ssize_t nonfmtpos;
13739 nonfmtpos = fmtpos++;
13740 while (fmtcnt >= 0 &&
13741 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13742 fmtpos++;
13743 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013744 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013745 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013746 if (nonfmt == NULL)
13747 goto onError;
13748 r = _PyAccu_Accumulate(&acc, nonfmt);
13749 Py_DECREF(nonfmt);
13750 if (r)
13751 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 }
13753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013754 /* Got a format specifier */
13755 int flags = 0;
13756 Py_ssize_t width = -1;
13757 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013758 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013759 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013760 int isnumok;
13761 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013762 void *pbuf = NULL;
13763 Py_ssize_t pindex, len;
13764 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013766 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013767 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13768 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013769 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 Py_ssize_t keylen;
13771 PyObject *key;
13772 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013773
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 if (dict == NULL) {
13775 PyErr_SetString(PyExc_TypeError,
13776 "format requires a mapping");
13777 goto onError;
13778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013779 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013780 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 /* Skip over balanced parentheses */
13783 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013784 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13785 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013786 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013787 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013788 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013789 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 if (fmtcnt < 0 || pcount > 0) {
13793 PyErr_SetString(PyExc_ValueError,
13794 "incomplete format key");
13795 goto onError;
13796 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013797 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013798 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 if (key == NULL)
13800 goto onError;
13801 if (args_owned) {
13802 Py_DECREF(args);
13803 args_owned = 0;
13804 }
13805 args = PyObject_GetItem(dict, key);
13806 Py_DECREF(key);
13807 if (args == NULL) {
13808 goto onError;
13809 }
13810 args_owned = 1;
13811 arglen = -1;
13812 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013813 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013815 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13816 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 case '-': flags |= F_LJUST; continue;
13818 case '+': flags |= F_SIGN; continue;
13819 case ' ': flags |= F_BLANK; continue;
13820 case '#': flags |= F_ALT; continue;
13821 case '0': flags |= F_ZERO; continue;
13822 }
13823 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013824 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 if (c == '*') {
13826 v = getnextarg(args, arglen, &argidx);
13827 if (v == NULL)
13828 goto onError;
13829 if (!PyLong_Check(v)) {
13830 PyErr_SetString(PyExc_TypeError,
13831 "* wants int");
13832 goto onError;
13833 }
13834 width = PyLong_AsLong(v);
13835 if (width == -1 && PyErr_Occurred())
13836 goto onError;
13837 if (width < 0) {
13838 flags |= F_LJUST;
13839 width = -width;
13840 }
13841 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013842 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013843 }
13844 else if (c >= '0' && c <= '9') {
13845 width = c - '0';
13846 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013847 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013848 if (c < '0' || c > '9')
13849 break;
13850 if ((width*10) / 10 != width) {
13851 PyErr_SetString(PyExc_ValueError,
13852 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013853 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013854 }
13855 width = width*10 + (c - '0');
13856 }
13857 }
13858 if (c == '.') {
13859 prec = 0;
13860 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013862 if (c == '*') {
13863 v = getnextarg(args, arglen, &argidx);
13864 if (v == NULL)
13865 goto onError;
13866 if (!PyLong_Check(v)) {
13867 PyErr_SetString(PyExc_TypeError,
13868 "* wants int");
13869 goto onError;
13870 }
13871 prec = PyLong_AsLong(v);
13872 if (prec == -1 && PyErr_Occurred())
13873 goto onError;
13874 if (prec < 0)
13875 prec = 0;
13876 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 }
13879 else if (c >= '0' && c <= '9') {
13880 prec = c - '0';
13881 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013882 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013883 if (c < '0' || c > '9')
13884 break;
13885 if ((prec*10) / 10 != prec) {
13886 PyErr_SetString(PyExc_ValueError,
13887 "prec too big");
13888 goto onError;
13889 }
13890 prec = prec*10 + (c - '0');
13891 }
13892 }
13893 } /* prec */
13894 if (fmtcnt >= 0) {
13895 if (c == 'h' || c == 'l' || c == 'L') {
13896 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013897 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013898 }
13899 }
13900 if (fmtcnt < 0) {
13901 PyErr_SetString(PyExc_ValueError,
13902 "incomplete format");
13903 goto onError;
13904 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013905
13906 if (c == '%') {
13907 _PyAccu_Accumulate(&acc, percent);
13908 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013910
13911
13912 v = getnextarg(args, arglen, &argidx);
13913 if (v == NULL)
13914 goto onError;
13915
Benjamin Peterson29060642009-01-31 22:14:21 +000013916 sign = 0;
13917 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013918 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 switch (c) {
13920
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 case 's':
13922 case 'r':
13923 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013924 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 temp = v;
13926 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013927 }
13928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 if (c == 's')
13930 temp = PyObject_Str(v);
13931 else if (c == 'r')
13932 temp = PyObject_Repr(v);
13933 else
13934 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013936 break;
13937
13938 case 'i':
13939 case 'd':
13940 case 'u':
13941 case 'o':
13942 case 'x':
13943 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013944 isnumok = 0;
13945 if (PyNumber_Check(v)) {
13946 PyObject *iobj=NULL;
13947
13948 if (PyLong_Check(v)) {
13949 iobj = v;
13950 Py_INCREF(iobj);
13951 }
13952 else {
13953 iobj = PyNumber_Long(v);
13954 }
13955 if (iobj!=NULL) {
13956 if (PyLong_Check(iobj)) {
13957 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013958 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013959 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013960 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 }
13962 else {
13963 Py_DECREF(iobj);
13964 }
13965 }
13966 }
13967 if (!isnumok) {
13968 PyErr_Format(PyExc_TypeError,
13969 "%%%c format: a number is required, "
13970 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13971 goto onError;
13972 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013973 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013975 fillobj = zero;
13976 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013977 break;
13978
13979 case 'e':
13980 case 'E':
13981 case 'f':
13982 case 'F':
13983 case 'g':
13984 case 'G':
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013986 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013987 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013988 fillobj = zero;
13989 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013990 temp = formatfloat(v, flags, prec, c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013991 break;
13992
13993 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013994 {
13995 Py_UCS4 ch = formatchar(v);
13996 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013997 goto onError;
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013998 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013999 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014000 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014001
14002 default:
14003 PyErr_Format(PyExc_ValueError,
14004 "unsupported format character '%c' (0x%x) "
14005 "at index %zd",
14006 (31<=c && c<=126) ? (char)c : '?',
14007 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014008 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000014009 goto onError;
14010 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014011 if (temp == NULL)
14012 goto onError;
14013 assert (PyUnicode_Check(temp));
14014 if (PyUnicode_READY(temp) == -1) {
14015 Py_CLEAR(temp);
14016 goto onError;
14017 }
14018 kind = PyUnicode_KIND(temp);
14019 pbuf = PyUnicode_DATA(temp);
14020 len = PyUnicode_GET_LENGTH(temp);
14021
14022 if (c == 's' || c == 'r' || c == 'a') {
14023 if (prec >= 0 && len > prec)
14024 len = prec;
14025 }
14026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014027 /* pbuf is initialized here. */
14028 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000014029 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014030 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
14031 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014033 pindex++;
14034 }
14035 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
14036 signobj = plus;
14037 len--;
14038 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000014039 }
14040 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014041 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000014042 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014043 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000014044 else
14045 sign = 0;
14046 }
14047 if (width < len)
14048 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014049 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014050 if (fill != ' ') {
14051 assert(signobj != NULL);
14052 if (_PyAccu_Accumulate(&acc, signobj))
14053 goto onError;
14054 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014055 if (width > len)
14056 width--;
14057 }
14058 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014059 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014060 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000014061 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014062 second = get_latin1_char(
14063 PyUnicode_READ(kind, pbuf, pindex + 1));
14064 pindex += 2;
14065 if (second == NULL ||
14066 _PyAccu_Accumulate(&acc, zero) ||
14067 _PyAccu_Accumulate(&acc, second))
14068 goto onError;
14069 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000014070 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014071 width -= 2;
14072 if (width < 0)
14073 width = 0;
14074 len -= 2;
14075 }
14076 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014077 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014078 if (repeat_accumulate(&acc, fillobj, width - len))
14079 goto onError;
14080 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000014081 }
14082 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014083 if (sign) {
14084 assert(signobj != NULL);
14085 if (_PyAccu_Accumulate(&acc, signobj))
14086 goto onError;
14087 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014088 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014089 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14090 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014091 second = get_latin1_char(
14092 PyUnicode_READ(kind, pbuf, pindex + 1));
14093 pindex += 2;
14094 if (second == NULL ||
14095 _PyAccu_Accumulate(&acc, zero) ||
14096 _PyAccu_Accumulate(&acc, second))
14097 goto onError;
14098 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014099 }
14100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014101 /* Copy all characters, preserving len */
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014102 if (pindex == 0 && len == PyUnicode_GET_LENGTH(temp)) {
14103 r = _PyAccu_Accumulate(&acc, temp);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014104 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014105 else {
Victor Stinnerb80e46e2012-04-30 05:21:52 +020014106 v = PyUnicode_Substring(temp, pindex, pindex + len);
14107 if (v == NULL)
14108 goto onError;
14109 r = _PyAccu_Accumulate(&acc, v);
14110 Py_DECREF(v);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014111 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014112 if (r)
14113 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020014114 if (width > len && repeat_accumulate(&acc, blank, width - len))
14115 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 if (dict && (argidx < arglen) && c != '%') {
14117 PyErr_SetString(PyExc_TypeError,
14118 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000014119 goto onError;
14120 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014121 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000014122 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123 } /* until end */
14124 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 PyErr_SetString(PyExc_TypeError,
14126 "not all arguments converted during string formatting");
14127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128 }
14129
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014130 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014131 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014132 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133 }
14134 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014135 Py_XDECREF(temp);
14136 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014137 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014138
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000014140 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014141 Py_XDECREF(temp);
14142 Py_XDECREF(second);
14143 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014144 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014145 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014146 }
14147 return NULL;
14148}
14149
/* Forward declaration: unicode_new() below hands construction of str
   subclasses to this function, which is defined after it. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14152
Tim Peters6d6c1a32001-08-02 04:15:00 +000014153static PyObject *
14154unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14155{
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014157 static char *kwlist[] = {"object", "encoding", "errors", 0};
14158 char *encoding = NULL;
14159 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014160
Benjamin Peterson14339b62009-01-31 16:36:08 +000014161 if (type != &PyUnicode_Type)
14162 return unicode_subtype_new(type, args, kwds);
14163 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014164 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014165 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014166 if (x == NULL) {
14167 Py_INCREF(unicode_empty);
14168 return unicode_empty;
14169 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 if (encoding == NULL && errors == NULL)
14171 return PyObject_Str(x);
14172 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014173 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014174}
14175
Guido van Rossume023fe02001-08-30 03:12:59 +000014176static PyObject *
14177unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14178{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014179 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014180 Py_ssize_t length, char_size;
14181 int share_wstr, share_utf8;
14182 unsigned int kind;
14183 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014184
Benjamin Peterson14339b62009-01-31 16:36:08 +000014185 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014186
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014187 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014188 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014189 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014190 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014191 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014192 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014193 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014194 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014195
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014196 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014197 if (self == NULL) {
14198 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014199 return NULL;
14200 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014201 kind = PyUnicode_KIND(unicode);
14202 length = PyUnicode_GET_LENGTH(unicode);
14203
14204 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014205#ifdef Py_DEBUG
14206 _PyUnicode_HASH(self) = -1;
14207#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014208 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014209#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014210 _PyUnicode_STATE(self).interned = 0;
14211 _PyUnicode_STATE(self).kind = kind;
14212 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014213 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014214 _PyUnicode_STATE(self).ready = 1;
14215 _PyUnicode_WSTR(self) = NULL;
14216 _PyUnicode_UTF8_LENGTH(self) = 0;
14217 _PyUnicode_UTF8(self) = NULL;
14218 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014219 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014220
14221 share_utf8 = 0;
14222 share_wstr = 0;
14223 if (kind == PyUnicode_1BYTE_KIND) {
14224 char_size = 1;
14225 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14226 share_utf8 = 1;
14227 }
14228 else if (kind == PyUnicode_2BYTE_KIND) {
14229 char_size = 2;
14230 if (sizeof(wchar_t) == 2)
14231 share_wstr = 1;
14232 }
14233 else {
14234 assert(kind == PyUnicode_4BYTE_KIND);
14235 char_size = 4;
14236 if (sizeof(wchar_t) == 4)
14237 share_wstr = 1;
14238 }
14239
14240 /* Ensure we won't overflow the length. */
14241 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14242 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014243 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014244 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014245 data = PyObject_MALLOC((length + 1) * char_size);
14246 if (data == NULL) {
14247 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014248 goto onError;
14249 }
14250
Victor Stinnerc3c74152011-10-02 20:39:55 +020014251 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014252 if (share_utf8) {
14253 _PyUnicode_UTF8_LENGTH(self) = length;
14254 _PyUnicode_UTF8(self) = data;
14255 }
14256 if (share_wstr) {
14257 _PyUnicode_WSTR_LENGTH(self) = length;
14258 _PyUnicode_WSTR(self) = (wchar_t *)data;
14259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014260
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014261 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014262 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014263 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014264#ifdef Py_DEBUG
14265 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14266#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014267 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014268 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014269
14270onError:
14271 Py_DECREF(unicode);
14272 Py_DECREF(self);
14273 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014274}
14275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014276PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000014277 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014278\n\
Collin Winterd474ce82007-08-07 19:42:11 +000014279Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000014280encoding defaults to the current default string encoding.\n\
14281errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014282
/* Forward declaration: referenced by the tp_iter slot of PyUnicode_Type
   below. */
static PyObject *unicode_iter(PyObject *seq);
14284
/* Type object for str.  The initializer is positional: each entry fills
   the slot named in the comment next to it, and a 0 leaves that slot
   unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                                      /* tp_name */
    sizeof(PyUnicodeObject),                    /* tp_basicsize */
    0,                                          /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,                /* tp_dealloc */
    0,                                          /* tp_print */
    0,                                          /* tp_getattr */
    0,                                          /* tp_setattr */
    0,                                          /* tp_reserved */
    unicode_repr,                               /* tp_repr */
    &unicode_as_number,                         /* tp_as_number */
    &unicode_as_sequence,                       /* tp_as_sequence */
    &unicode_as_mapping,                        /* tp_as_mapping */
    (hashfunc) unicode_hash,                    /* tp_hash*/
    0,                                          /* tp_call*/
    (reprfunc) unicode_str,                     /* tp_str */
    PyObject_GenericGetAttr,                    /* tp_getattro */
    0,                                          /* tp_setattro */
    0,                                          /* tp_as_buffer */
    /* Py_TPFLAGS_BASETYPE is set, and unicode_new() has a dedicated path
       for types other than PyUnicode_Type itself. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,                /* tp_flags */
    unicode_doc,                                /* tp_doc */
    0,                                          /* tp_traverse */
    0,                                          /* tp_clear */
    PyUnicode_RichCompare,                      /* tp_richcompare */
    0,                                          /* tp_weaklistoffset */
    unicode_iter,                               /* tp_iter */
    0,                                          /* tp_iternext */
    unicode_methods,                            /* tp_methods */
    0,                                          /* tp_members */
    0,                                          /* tp_getset */
    &PyBaseObject_Type,                         /* tp_base */
    0,                                          /* tp_dict */
    0,                                          /* tp_descr_get */
    0,                                          /* tp_descr_set */
    0,                                          /* tp_dictoffset */
    0,                                          /* tp_init */
    0,                                          /* tp_alloc */
    unicode_new,                                /* tp_new */
    PyObject_Del,                               /* tp_free */
};
14328
14329/* Initialize the Unicode implementation */
14330
Victor Stinner3a50e702011-10-18 21:21:00 +020014331int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014332{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014333 int i;
14334
Thomas Wouters477c8d52006-05-27 19:21:47 +000014335 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014336 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014337 0x000A, /* LINE FEED */
14338 0x000D, /* CARRIAGE RETURN */
14339 0x001C, /* FILE SEPARATOR */
14340 0x001D, /* GROUP SEPARATOR */
14341 0x001E, /* RECORD SEPARATOR */
14342 0x0085, /* NEXT LINE */
14343 0x2028, /* LINE SEPARATOR */
14344 0x2029, /* PARAGRAPH SEPARATOR */
14345 };
14346
Fred Drakee4315f52000-05-09 19:53:39 +000014347 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014348 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014349 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014350 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014351 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014352
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014353 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014354 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014355 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014356 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014357
14358 /* initialize the linebreak bloom filter */
14359 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014360 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014361 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014362
14363 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014364
14365#ifdef HAVE_MBCS
14366 winver.dwOSVersionInfoSize = sizeof(winver);
14367 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14368 PyErr_SetFromWindowsErr(0);
14369 return -1;
14370 }
14371#endif
14372 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014373}
14374
14375/* Finalize the Unicode implementation */
14376
/* Free-list clearing entry point for str objects.  This implementation
   does no work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14382
Guido van Rossumd57fd912000-03-10 22:53:23 +000014383void
Thomas Wouters78890102000-07-22 19:25:51 +000014384_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014385{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014386 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014387
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014388 Py_XDECREF(unicode_empty);
14389 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014390
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014391 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014392 if (unicode_latin1[i]) {
14393 Py_DECREF(unicode_latin1[i]);
14394 unicode_latin1[i] = NULL;
14395 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014396 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014397 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014398 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014399}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014400
Walter Dörwald16807132007-05-25 13:52:07 +000014401void
14402PyUnicode_InternInPlace(PyObject **p)
14403{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014404 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014405 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014406#ifdef Py_DEBUG
14407 assert(s != NULL);
14408 assert(_PyUnicode_CHECK(s));
14409#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014410 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014411 return;
14412#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 /* If it's a subclass, we don't really know what putting
14414 it in the interned dict might do. */
14415 if (!PyUnicode_CheckExact(s))
14416 return;
14417 if (PyUnicode_CHECK_INTERNED(s))
14418 return;
14419 if (interned == NULL) {
14420 interned = PyDict_New();
14421 if (interned == NULL) {
14422 PyErr_Clear(); /* Don't leave an exception */
14423 return;
14424 }
14425 }
14426 /* It might be that the GetItem call fails even
14427 though the key is present in the dictionary,
14428 namely when this happens during a stack overflow. */
14429 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014430 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014432
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 if (t) {
14434 Py_INCREF(t);
14435 Py_DECREF(*p);
14436 *p = t;
14437 return;
14438 }
Walter Dörwald16807132007-05-25 13:52:07 +000014439
Benjamin Peterson14339b62009-01-31 16:36:08 +000014440 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014441 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014442 PyErr_Clear();
14443 PyThreadState_GET()->recursion_critical = 0;
14444 return;
14445 }
14446 PyThreadState_GET()->recursion_critical = 0;
14447 /* The two references in interned are not counted by refcnt.
14448 The deallocator will take care of this */
14449 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014450 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014451}
14452
14453void
14454PyUnicode_InternImmortal(PyObject **p)
14455{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014456 PyUnicode_InternInPlace(p);
14457 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014458 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014459 Py_INCREF(*p);
14460 }
Walter Dörwald16807132007-05-25 13:52:07 +000014461}
14462
14463PyObject *
14464PyUnicode_InternFromString(const char *cp)
14465{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 PyObject *s = PyUnicode_FromString(cp);
14467 if (s == NULL)
14468 return NULL;
14469 PyUnicode_InternInPlace(&s);
14470 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014471}
14472
Alexander Belopolsky40018472011-02-26 01:02:56 +000014473void
14474_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014475{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014476 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014477 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 Py_ssize_t i, n;
14479 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014480
Benjamin Peterson14339b62009-01-31 16:36:08 +000014481 if (interned == NULL || !PyDict_Check(interned))
14482 return;
14483 keys = PyDict_Keys(interned);
14484 if (keys == NULL || !PyList_Check(keys)) {
14485 PyErr_Clear();
14486 return;
14487 }
Walter Dörwald16807132007-05-25 13:52:07 +000014488
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14490 detector, interned unicode strings are not forcibly deallocated;
14491 rather, we give them their stolen references back, and then clear
14492 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014493
Benjamin Peterson14339b62009-01-31 16:36:08 +000014494 n = PyList_GET_SIZE(keys);
14495 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014496 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014497 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014498 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014499 if (PyUnicode_READY(s) == -1) {
14500 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014501 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014503 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014504 case SSTATE_NOT_INTERNED:
14505 /* XXX Shouldn't happen */
14506 break;
14507 case SSTATE_INTERNED_IMMORTAL:
14508 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014509 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014510 break;
14511 case SSTATE_INTERNED_MORTAL:
14512 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014513 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 break;
14515 default:
14516 Py_FatalError("Inconsistent interned string state.");
14517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014518 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014519 }
14520 fprintf(stderr, "total size of all interned strings: "
14521 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14522 "mortal/immortal\n", mortal_size, immortal_size);
14523 Py_DECREF(keys);
14524 PyDict_Clear(interned);
14525 Py_DECREF(interned);
14526 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014527}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014528
14529
/********************* Unicode Iterator **************************/
14531
14532typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014533 PyObject_HEAD
14534 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014535 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014536} unicodeiterobject;
14537
14538static void
14539unicodeiter_dealloc(unicodeiterobject *it)
14540{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014541 _PyObject_GC_UNTRACK(it);
14542 Py_XDECREF(it->it_seq);
14543 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014544}
14545
14546static int
14547unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14548{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014549 Py_VISIT(it->it_seq);
14550 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014551}
14552
14553static PyObject *
14554unicodeiter_next(unicodeiterobject *it)
14555{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014556 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014557
Benjamin Peterson14339b62009-01-31 16:36:08 +000014558 assert(it != NULL);
14559 seq = it->it_seq;
14560 if (seq == NULL)
14561 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014562 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014564 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14565 int kind = PyUnicode_KIND(seq);
14566 void *data = PyUnicode_DATA(seq);
14567 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14568 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014569 if (item != NULL)
14570 ++it->it_index;
14571 return item;
14572 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014573
Benjamin Peterson14339b62009-01-31 16:36:08 +000014574 Py_DECREF(seq);
14575 it->it_seq = NULL;
14576 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014577}
14578
14579static PyObject *
14580unicodeiter_len(unicodeiterobject *it)
14581{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014582 Py_ssize_t len = 0;
14583 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014584 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014585 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014586}
14587
14588PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14589
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014590static PyObject *
14591unicodeiter_reduce(unicodeiterobject *it)
14592{
14593 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014594 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014595 it->it_seq, it->it_index);
14596 } else {
14597 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14598 if (u == NULL)
14599 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014600 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014601 }
14602}
14603
14604PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14605
14606static PyObject *
14607unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14608{
14609 Py_ssize_t index = PyLong_AsSsize_t(state);
14610 if (index == -1 && PyErr_Occurred())
14611 return NULL;
14612 if (index < 0)
14613 index = 0;
14614 it->it_index = index;
14615 Py_RETURN_NONE;
14616}
14617
14618PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14619
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014620static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014621 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014622 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014623 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14624 reduce_doc},
14625 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14626 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014627 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014628};
14629
14630PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014631 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14632 "str_iterator", /* tp_name */
14633 sizeof(unicodeiterobject), /* tp_basicsize */
14634 0, /* tp_itemsize */
14635 /* methods */
14636 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14637 0, /* tp_print */
14638 0, /* tp_getattr */
14639 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014640 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014641 0, /* tp_repr */
14642 0, /* tp_as_number */
14643 0, /* tp_as_sequence */
14644 0, /* tp_as_mapping */
14645 0, /* tp_hash */
14646 0, /* tp_call */
14647 0, /* tp_str */
14648 PyObject_GenericGetAttr, /* tp_getattro */
14649 0, /* tp_setattro */
14650 0, /* tp_as_buffer */
14651 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14652 0, /* tp_doc */
14653 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14654 0, /* tp_clear */
14655 0, /* tp_richcompare */
14656 0, /* tp_weaklistoffset */
14657 PyObject_SelfIter, /* tp_iter */
14658 (iternextfunc)unicodeiter_next, /* tp_iternext */
14659 unicodeiter_methods, /* tp_methods */
14660 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014661};
14662
14663static PyObject *
14664unicode_iter(PyObject *seq)
14665{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014666 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014667
Benjamin Peterson14339b62009-01-31 16:36:08 +000014668 if (!PyUnicode_Check(seq)) {
14669 PyErr_BadInternalCall();
14670 return NULL;
14671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014672 if (PyUnicode_READY(seq) == -1)
14673 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014674 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14675 if (it == NULL)
14676 return NULL;
14677 it->it_index = 0;
14678 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014679 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014680 _PyObject_GC_TRACK(it);
14681 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014682}
14683
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014684
14685size_t
14686Py_UNICODE_strlen(const Py_UNICODE *u)
14687{
14688 int res = 0;
14689 while(*u++)
14690 res++;
14691 return res;
14692}
14693
14694Py_UNICODE*
14695Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14696{
14697 Py_UNICODE *u = s1;
14698 while ((*u++ = *s2++));
14699 return s1;
14700}
14701
14702Py_UNICODE*
14703Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14704{
14705 Py_UNICODE *u = s1;
14706 while ((*u++ = *s2++))
14707 if (n-- == 0)
14708 break;
14709 return s1;
14710}
14711
14712Py_UNICODE*
14713Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14714{
14715 Py_UNICODE *u1 = s1;
14716 u1 += Py_UNICODE_strlen(u1);
14717 Py_UNICODE_strcpy(u1, s2);
14718 return s1;
14719}
14720
14721int
14722Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14723{
14724 while (*s1 && *s2 && *s1 == *s2)
14725 s1++, s2++;
14726 if (*s1 && *s2)
14727 return (*s1 < *s2) ? -1 : +1;
14728 if (*s1)
14729 return 1;
14730 if (*s2)
14731 return -1;
14732 return 0;
14733}
14734
14735int
14736Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14737{
14738 register Py_UNICODE u1, u2;
14739 for (; n != 0; n--) {
14740 u1 = *s1;
14741 u2 = *s2;
14742 if (u1 != u2)
14743 return (u1 < u2) ? -1 : +1;
14744 if (u1 == '\0')
14745 return 0;
14746 s1++;
14747 s2++;
14748 }
14749 return 0;
14750}
14751
14752Py_UNICODE*
14753Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14754{
14755 const Py_UNICODE *p;
14756 for (p = s; *p; p++)
14757 if (*p == c)
14758 return (Py_UNICODE*)p;
14759 return NULL;
14760}
14761
14762Py_UNICODE*
14763Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14764{
14765 const Py_UNICODE *p;
14766 p = s + Py_UNICODE_strlen(s);
14767 while (p != s) {
14768 p--;
14769 if (*p == c)
14770 return (Py_UNICODE*)p;
14771 }
14772 return NULL;
14773}
Victor Stinner331ea922010-08-10 16:37:20 +000014774
Victor Stinner71133ff2010-09-01 23:43:53 +000014775Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014776PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014777{
Victor Stinner577db2c2011-10-11 22:12:48 +020014778 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014779 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014781 if (!PyUnicode_Check(unicode)) {
14782 PyErr_BadArgument();
14783 return NULL;
14784 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014785 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014786 if (u == NULL)
14787 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014788 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014789 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014790 PyErr_NoMemory();
14791 return NULL;
14792 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014793 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014794 size *= sizeof(Py_UNICODE);
14795 copy = PyMem_Malloc(size);
14796 if (copy == NULL) {
14797 PyErr_NoMemory();
14798 return NULL;
14799 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014800 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014801 return copy;
14802}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014803
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
14806
14807static PyMethodDef _string_methods[] = {
14808 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14809 METH_O, PyDoc_STR("split the argument as a field name")},
14810 {"formatter_parser", (PyCFunction) formatter_parser,
14811 METH_O, PyDoc_STR("parse the argument as a format string")},
14812 {NULL, NULL}
14813};
14814
14815static struct PyModuleDef _string_module = {
14816 PyModuleDef_HEAD_INIT,
14817 "_string",
14818 PyDoc_STR("string helper module"),
14819 0,
14820 _string_methods,
14821 NULL,
14822 NULL,
14823 NULL,
14824 NULL
14825};
14826
14827PyMODINIT_FUNC
14828PyInit__string(void)
14829{
14830 return PyModule_Create(&_string_module);
14831}
14832
14833
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014834#ifdef __cplusplus
14835}
14836#endif