blob: 0ed38fef8f554efdfb4dae831d04a55325ed8003 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
Victor Stinnere90fe6a2011-10-01 16:48:13 +020079#define _PyUnicode_UTF8(op) \
80 (((PyCompactUnicodeObject*)(op))->utf8)
81#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020082 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020083 assert(PyUnicode_IS_READY(op)), \
84 PyUnicode_IS_COMPACT_ASCII(op) ? \
85 ((char*)((PyASCIIObject*)(op) + 1)) : \
86 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020087#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020088 (((PyCompactUnicodeObject*)(op))->utf8_length)
89#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020090 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020091 assert(PyUnicode_IS_READY(op)), \
92 PyUnicode_IS_COMPACT_ASCII(op) ? \
93 ((PyASCIIObject*)(op))->length : \
94 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020095#define _PyUnicode_WSTR(op) \
96 (((PyASCIIObject*)(op))->wstr)
97#define _PyUnicode_WSTR_LENGTH(op) \
98 (((PyCompactUnicodeObject*)(op))->wstr_length)
99#define _PyUnicode_LENGTH(op) \
100 (((PyASCIIObject *)(op))->length)
101#define _PyUnicode_STATE(op) \
102 (((PyASCIIObject *)(op))->state)
103#define _PyUnicode_HASH(op) \
104 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200105#define _PyUnicode_KIND(op) \
106 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200107 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200108#define _PyUnicode_GET_LENGTH(op) \
109 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200110 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200111#define _PyUnicode_DATA_ANY(op) \
112 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New() */
116#define MAX_MAXCHAR(maxchar1, maxchar2) \
117 ((maxchar1) | (maxchar2))
118
Victor Stinner910337b2011-10-03 03:20:16 +0200119#undef PyUnicode_READY
120#define PyUnicode_READY(op) \
121 (assert(_PyUnicode_CHECK(op)), \
122 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200123 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100124 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
Victor Stinnerc379ead2011-10-03 12:52:27 +0200126#define _PyUnicode_SHARE_UTF8(op) \
127 (assert(_PyUnicode_CHECK(op)), \
128 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
129 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the object is the same address as its
   character data (PyUnicode_DATA).  Only the macro parameter is referenced,
   so the macro works whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
Victor Stinner829c0ad2011-10-03 01:08:02 +0200134/* true if the Unicode object has an allocated UTF-8 memory block
135 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200136#define _PyUnicode_HAS_UTF8_MEMORY(op) \
137 (assert(_PyUnicode_CHECK(op)), \
138 (!PyUnicode_IS_COMPACT_ASCII(op) \
139 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200140 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
141
Victor Stinner03490912011-10-03 23:45:12 +0200142/* true if the Unicode object has an allocated wstr memory block
143 (not shared with other data) */
144#define _PyUnicode_HAS_WSTR_MEMORY(op) \
145 (assert(_PyUnicode_CHECK(op)), \
146 (_PyUnicode_WSTR(op) && \
147 (!PyUnicode_IS_READY(op) || \
148 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names; begin and end
   are pointers to the source characters and should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written.

   Every pointer argument is parenthesized before it is used, so an
   argument such as "buf + offset" is converted as a whole instead of
   having the cast bind to "buf" only.  The first loop copies four
   characters per iteration, the second loop copies the remainder. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
   Another way to look at this is to say that the actual reference
   count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Lookup table for fast detection of the most frequent whitespace
   characters.  Indexed by an ASCII code (0..127); an entry is 1 for:

     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE

   and 0 for every other code. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 1, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: TAB, LF, VT, FF, CR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 1,     /* 0x18 - 0x1F: FS, GS, RS, US */
    1, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27: SPACE */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Lookup table for fast detection of ASCII line breaks.  Indexed by an
   ASCII code (0..127); an entry is 1 for:

     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN

   and 0 for every other code.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x00 - 0x07 */
    0, 0, 1, 1, 1, 1, 0, 0,     /* 0x08 - 0x0F: LF, VT, FF, CR */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x10 - 0x17 */
    0, 0, 0, 0, 1, 1, 1, 0,     /* 0x18 - 0x1F: FS, GS, RS */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x20 - 0x27 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x38 - 0x3F */

    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,     /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0      /* 0x78 - 0x7F */
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
Victor Stinner910337b2011-10-03 03:20:16 +0200293#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a unicode object.

   op must satisfy PyUnicode_Check().  The header fields (kind, the
   ascii/compact/ready state bits, wstr, utf8 and their lengths) are checked
   against each other with assert().  When check_content is non-zero and the
   string is not in the wchar_t-only (not ready) state, the characters are
   scanned as well to check that the smallest possible kind is used and that
   the data ends with a null character.

   Always returns 1, so the call itself can be wrapped in assert(); any
   inconsistency triggers a failing assert() inside the function. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header is inspected. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact, non-ASCII: the character data starts right after
               the PyCompactUnicodeObject structure. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* Not compact: the data pointer lives in the PyUnicodeObject. */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer may be set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact string. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* An ASCII string shares its data with the utf8 pointer;
                   a non-ASCII string must not. */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t (selected at compile time below). */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache buffer must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest code point stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest code point must fall in the range that requires
           exactly this kind (and this ascii flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character just past the end must be a null character. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
Victor Stinner910337b2011-10-03 03:20:16 +0200407#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415 len = _PyUnicode_WSTR_LENGTH(unicode);
416 if (len == 0) {
417 Py_INCREF(unicode_empty);
418 Py_DECREF(unicode);
419 return unicode_empty;
420 }
421
422 if (len == 1) {
423 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
424 if (ch < 256) {
425 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
426 Py_DECREF(unicode);
427 return latin1_char;
428 }
429 }
430
431 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200432 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 return NULL;
434 }
435#else
Victor Stinneraa771272012-10-04 02:32:58 +0200436 assert(Py_REFCNT(unicode) == 1);
437
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
Victor Stinner3a50e702011-10-18 21:21:00 +0200507#ifdef HAVE_MBCS
508static OSVERSIONINFOEX winver;
509#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits
   of each Unicode character (its value modulo BLOOM_WIDTH) as the bit
   index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
Antoine Pitrouf068f942010-01-13 14:19:12 +0000519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529#define BLOOM_MASK unsigned long
530
531static BLOOM_MASK bloom_linebreak;
532
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by the low bits
   of ch (ch modulo BLOOM_WIDTH).  mask is parenthesized so that an
   expression argument (for example "a | b") is combined as a whole;
   for BLOOM_ADD, mask must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
Benjamin Peterson29060642009-01-31 22:14:21 +0000536#define BLOOM_LINEBREAK(ch) \
537 ((ch) < 128U ? ascii_linebreak[(ch)] : \
538 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555#define BLOOM_MEMBER(mask, chr, str) \
556 (BLOOM(mask, chr) \
557 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
Victor Stinnerafffce42012-10-03 23:03:17 +0200643#ifdef Py_DEBUG
644/* Fill the data of an Unicode string with invalid characters to detect bugs
645 earlier.
646
647 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
648 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
649 invalid character in Unicode 6.0. */
650static void
651unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
652{
653 int kind = PyUnicode_KIND(unicode);
654 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
655 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
656 if (length <= old_length)
657 return;
658 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
659}
660#endif
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200670#ifdef Py_DEBUG
671 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
672#endif
673
Victor Stinner79891572012-05-03 13:43:07 +0200674 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100676 assert(PyUnicode_IS_COMPACT(unicode));
677
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200678 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 struct_size = sizeof(PyASCIIObject);
681 else
682 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200683 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
686 PyErr_NoMemory();
687 return NULL;
688 }
689 new_size = (struct_size + (length + 1) * char_size);
690
Victor Stinner84def372011-12-11 20:04:56 +0100691 _Py_DEC_REFTOTAL;
692 _Py_ForgetReference(unicode);
693
694 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
695 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100696 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 PyErr_NoMemory();
698 return NULL;
699 }
Victor Stinner84def372011-12-11 20:04:56 +0100700 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100702
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200704 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100706 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 _PyUnicode_WSTR_LENGTH(unicode) = length;
708 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200709#ifdef Py_DEBUG
710 unicode_fill_invalid(unicode, old_length);
711#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
713 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200714 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 return unicode;
716}
717
Alexander Belopolsky40018472011-02-26 01:02:56 +0000718static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200719resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720{
Victor Stinner95663112011-10-04 01:03:50 +0200721 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000725
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 if (PyUnicode_IS_READY(unicode)) {
727 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200730#ifdef Py_DEBUG
731 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
732#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733
734 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200735 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200736 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
737 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738
739 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
740 PyErr_NoMemory();
741 return -1;
742 }
743 new_size = (length + 1) * char_size;
744
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
746 {
747 PyObject_DEL(_PyUnicode_UTF8(unicode));
748 _PyUnicode_UTF8(unicode) = NULL;
749 _PyUnicode_UTF8_LENGTH(unicode) = 0;
750 }
751
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 data = (PyObject *)PyObject_REALLOC(data, new_size);
753 if (data == NULL) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200760 _PyUnicode_WSTR_LENGTH(unicode) = length;
761 }
762 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200764 _PyUnicode_UTF8_LENGTH(unicode) = length;
765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 _PyUnicode_LENGTH(unicode) = length;
767 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 unicode_fill_invalid(unicode, old_length);
770#endif
Victor Stinner95663112011-10-04 01:03:50 +0200771 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200772 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
Victor Stinner95663112011-10-04 01:03:50 +0200776 assert(_PyUnicode_WSTR(unicode) != NULL);
777
778 /* check for integer overflow */
779 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
780 PyErr_NoMemory();
781 return -1;
782 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200784 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200786 if (!wstr) {
787 PyErr_NoMemory();
788 return -1;
789 }
790 _PyUnicode_WSTR(unicode) = wstr;
791 _PyUnicode_WSTR(unicode)[length] = 0;
792 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200793 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 return 0;
795}
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797static PyObject*
798resize_copy(PyObject *unicode, Py_ssize_t length)
799{
800 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100801 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200802 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Benjamin Petersonbac79492012-01-14 13:34:47 -0500804 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100805 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806
807 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
808 if (copy == NULL)
809 return NULL;
810
811 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200812 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200814 }
815 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100817
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200818 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 if (w == NULL)
820 return NULL;
821 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
822 copy_length = Py_MIN(copy_length, length);
823 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
824 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200825 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200826 }
827}
828
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000830 Ux0000 terminated; some code (e.g. new_identifier)
831 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832
833 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000834 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
836*/
837
Alexander Belopolsky40018472011-02-26 01:02:56 +0000838static PyUnicodeObject *
839_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840{
841 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843
Thomas Wouters477c8d52006-05-27 19:21:47 +0000844 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 if (length == 0 && unicode_empty != NULL) {
846 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200847 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 }
849
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000850 /* Ensure we won't overflow the size. */
851 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
852 return (PyUnicodeObject *)PyErr_NoMemory();
853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 if (length < 0) {
855 PyErr_SetString(PyExc_SystemError,
856 "Negative size passed to _PyUnicode_New");
857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000858 }
859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
861 if (unicode == NULL)
862 return NULL;
863 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
864 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
865 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100866 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000867 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100868 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870
Jeremy Hyltond8082792003-09-16 19:41:39 +0000871 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000872 * the caller fails before initializing str -- unicode_resize()
873 * reads str[0], and the Keep-Alive optimization can keep memory
874 * allocated for str alive across a call to unicode_dealloc(unicode).
875 * We don't want unicode_resize to read uninitialized memory in
876 * that case.
877 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200878 _PyUnicode_WSTR(unicode)[0] = 0;
879 _PyUnicode_WSTR(unicode)[length] = 0;
880 _PyUnicode_WSTR_LENGTH(unicode) = length;
881 _PyUnicode_HASH(unicode) = -1;
882 _PyUnicode_STATE(unicode).interned = 0;
883 _PyUnicode_STATE(unicode).kind = 0;
884 _PyUnicode_STATE(unicode).compact = 0;
885 _PyUnicode_STATE(unicode).ready = 0;
886 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200887 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200889 _PyUnicode_UTF8(unicode) = NULL;
890 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100891 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 return unicode;
893}
894
Victor Stinnerf42dc442011-10-02 23:33:16 +0200895static const char*
896unicode_kind_name(PyObject *unicode)
897{
Victor Stinner42dfd712011-10-03 14:41:45 +0200898 /* don't check consistency: unicode_kind_name() is called from
899 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 if (!PyUnicode_IS_COMPACT(unicode))
901 {
902 if (!PyUnicode_IS_READY(unicode))
903 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600904 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 {
906 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 return "legacy ascii";
909 else
910 return "legacy latin1";
911 case PyUnicode_2BYTE_KIND:
912 return "legacy UCS2";
913 case PyUnicode_4BYTE_KIND:
914 return "legacy UCS4";
915 default:
916 return "<legacy invalid kind>";
917 }
918 }
919 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600920 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200922 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200923 return "ascii";
924 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200925 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200926 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200927 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200928 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200929 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930 default:
931 return "<invalid compact kind>";
932 }
933}
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936/* Functions wrapping macros for use in debugger */
937char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200938 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200939}
940
941void *_PyUnicode_compact_data(void *unicode) {
942 return _PyUnicode_COMPACT_DATA(unicode);
943}
944void *_PyUnicode_data(void *unicode){
945 printf("obj %p\n", unicode);
946 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
947 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
948 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
949 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
950 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
951 return PyUnicode_DATA(unicode);
952}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200953
954void
955_PyUnicode_Dump(PyObject *op)
956{
957 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200958 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
959 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
960 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200961
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 {
964 if (ascii->state.ascii)
965 data = (ascii + 1);
966 else
967 data = (compact + 1);
968 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 else
970 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200971 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
972
Victor Stinnera849a4b2011-10-03 12:12:11 +0200973 if (ascii->wstr == data)
974 printf("shared ");
975 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200976
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200978 printf(" (%zu), ", compact->wstr_length);
979 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
980 printf("shared ");
981 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200982 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200983 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200984}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985#endif
986
987PyObject *
988PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
989{
990 PyObject *obj;
991 PyCompactUnicodeObject *unicode;
992 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200993 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200994 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 Py_ssize_t char_size;
996 Py_ssize_t struct_size;
997
998 /* Optimization for empty strings */
999 if (size == 0 && unicode_empty != NULL) {
1000 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001001 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
1003
Victor Stinner9e9d6892011-10-04 01:02:02 +02001004 is_ascii = 0;
1005 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 struct_size = sizeof(PyCompactUnicodeObject);
1007 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001008 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 char_size = 1;
1010 is_ascii = 1;
1011 struct_size = sizeof(PyASCIIObject);
1012 }
1013 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001014 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001015 char_size = 1;
1016 }
1017 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001018 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019 char_size = 2;
1020 if (sizeof(wchar_t) == 2)
1021 is_sharing = 1;
1022 }
1023 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001024 if (maxchar > MAX_UNICODE) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "invalid maximum character passed to PyUnicode_New");
1027 return NULL;
1028 }
Victor Stinner8f825062012-04-27 13:55:39 +02001029 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 char_size = 4;
1031 if (sizeof(wchar_t) == 4)
1032 is_sharing = 1;
1033 }
1034
1035 /* Ensure we won't overflow the size. */
1036 if (size < 0) {
1037 PyErr_SetString(PyExc_SystemError,
1038 "Negative size passed to PyUnicode_New");
1039 return NULL;
1040 }
1041 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1042 return PyErr_NoMemory();
1043
1044 /* Duplicated allocation code from _PyObject_New() instead of a call to
1045 * PyObject_New() so we are able to allocate space for the object and
1046 * it's data buffer.
1047 */
1048 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1049 if (obj == NULL)
1050 return PyErr_NoMemory();
1051 obj = PyObject_INIT(obj, &PyUnicode_Type);
1052 if (obj == NULL)
1053 return NULL;
1054
1055 unicode = (PyCompactUnicodeObject *)obj;
1056 if (is_ascii)
1057 data = ((PyASCIIObject*)obj) + 1;
1058 else
1059 data = unicode + 1;
1060 _PyUnicode_LENGTH(unicode) = size;
1061 _PyUnicode_HASH(unicode) = -1;
1062 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 _PyUnicode_STATE(unicode).compact = 1;
1065 _PyUnicode_STATE(unicode).ready = 1;
1066 _PyUnicode_STATE(unicode).ascii = is_ascii;
1067 if (is_ascii) {
1068 ((char*)data)[size] = 0;
1069 _PyUnicode_WSTR(unicode) = NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((char*)data)[size] = 0;
1073 _PyUnicode_WSTR(unicode) = NULL;
1074 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001076 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 else {
1079 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001080 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001081 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001083 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 ((Py_UCS4*)data)[size] = 0;
1085 if (is_sharing) {
1086 _PyUnicode_WSTR_LENGTH(unicode) = size;
1087 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1088 }
1089 else {
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1091 _PyUnicode_WSTR(unicode) = NULL;
1092 }
1093 }
Victor Stinner8f825062012-04-27 13:55:39 +02001094#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001095 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001096#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001097 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 return obj;
1099}
1100
1101#if SIZEOF_WCHAR_T == 2
1102/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1103 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001104 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105
1106 This function assumes that unicode can hold one more code point than wstr
1107 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001108static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001110 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111{
1112 const wchar_t *iter;
1113 Py_UCS4 *ucs4_out;
1114
Victor Stinner910337b2011-10-03 03:20:16 +02001115 assert(unicode != NULL);
1116 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1118 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1119
1120 for (iter = begin; iter < end; ) {
1121 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1122 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001123 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1124 && (iter+1) < end
1125 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 {
Victor Stinner551ac952011-11-29 22:58:13 +01001127 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 iter += 2;
1129 }
1130 else {
1131 *ucs4_out++ = *iter;
1132 iter++;
1133 }
1134 }
1135 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1136 _PyUnicode_GET_LENGTH(unicode)));
1137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138}
1139#endif
1140
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141static int
Victor Stinner488fa492011-12-12 00:01:39 +01001142unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001143{
Victor Stinner488fa492011-12-12 00:01:39 +01001144 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001145 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001146 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001147 return -1;
1148 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001149 return 0;
1150}
1151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152static int
1153_copy_characters(PyObject *to, Py_ssize_t to_start,
1154 PyObject *from, Py_ssize_t from_start,
1155 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001157 unsigned int from_kind, to_kind;
1158 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(0 <= how_many);
1161 assert(0 <= from_start);
1162 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001164 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001165 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166
Victor Stinnerd3f08822012-05-29 12:57:52 +02001167 assert(PyUnicode_Check(to));
1168 assert(PyUnicode_IS_READY(to));
1169 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1170
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001171 if (how_many == 0)
1172 return 0;
1173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001177 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178
Victor Stinnerf1852262012-06-16 16:38:26 +02001179#ifdef Py_DEBUG
1180 if (!check_maxchar
1181 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1182 {
1183 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1184 Py_UCS4 ch;
1185 Py_ssize_t i;
1186 for (i=0; i < how_many; i++) {
1187 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1188 assert(ch <= to_maxchar);
1189 }
1190 }
1191#endif
1192
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001193 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 if (check_maxchar
1195 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1196 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001197 /* Writing Latin-1 characters into an ASCII string requires to
1198 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001199 Py_UCS4 max_char;
1200 max_char = ucs1lib_find_max_char(from_data,
1201 (Py_UCS1*)from_data + how_many);
1202 if (max_char >= 128)
1203 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001204 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001205 Py_MEMCPY((char*)to_data + to_kind * to_start,
1206 (char*)from_data + from_kind * from_start,
1207 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
1210 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS2,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_2BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001219 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS1, Py_UCS4,
1224 PyUnicode_1BYTE_DATA(from) + from_start,
1225 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
1229 else if (from_kind == PyUnicode_2BYTE_KIND
1230 && to_kind == PyUnicode_4BYTE_KIND)
1231 {
1232 _PyUnicode_CONVERT_BYTES(
1233 Py_UCS2, Py_UCS4,
1234 PyUnicode_2BYTE_DATA(from) + from_start,
1235 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1236 PyUnicode_4BYTE_DATA(to) + to_start
1237 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001238 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001239 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1241
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 if (!check_maxchar) {
1243 if (from_kind == PyUnicode_2BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS2, Py_UCS1,
1248 PyUnicode_2BYTE_DATA(from) + from_start,
1249 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_1BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS1,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_1BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else if (from_kind == PyUnicode_4BYTE_KIND
1264 && to_kind == PyUnicode_2BYTE_KIND)
1265 {
1266 _PyUnicode_CONVERT_BYTES(
1267 Py_UCS4, Py_UCS2,
1268 PyUnicode_4BYTE_DATA(from) + from_start,
1269 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1270 PyUnicode_2BYTE_DATA(to) + to_start
1271 );
1272 }
1273 else {
1274 assert(0);
1275 return -1;
1276 }
1277 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001278 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001280 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 Py_ssize_t i;
1282
Victor Stinnera0702ab2011-09-29 14:14:38 +02001283 for (i=0; i < how_many; i++) {
1284 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (ch > to_maxchar)
1286 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001287 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 }
1290 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return 0;
1292}
1293
Victor Stinnerd3f08822012-05-29 12:57:52 +02001294void
1295_PyUnicode_FastCopyCharacters(
1296 PyObject *to, Py_ssize_t to_start,
1297 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001298{
1299 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1300}
1301
1302Py_ssize_t
1303PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1304 PyObject *from, Py_ssize_t from_start,
1305 Py_ssize_t how_many)
1306{
1307 int err;
1308
1309 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1310 PyErr_BadInternalCall();
1311 return -1;
1312 }
1313
Benjamin Petersonbac79492012-01-14 13:34:47 -05001314 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001316 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 return -1;
1318
Victor Stinnerd3f08822012-05-29 12:57:52 +02001319 if (from_start < 0) {
1320 PyErr_SetString(PyExc_IndexError, "string index out of range");
1321 return -1;
1322 }
1323 if (to_start < 0) {
1324 PyErr_SetString(PyExc_IndexError, "string index out of range");
1325 return -1;
1326 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001327 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1328 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1329 PyErr_Format(PyExc_SystemError,
1330 "Cannot write %zi characters at %zi "
1331 "in a string of %zi characters",
1332 how_many, to_start, PyUnicode_GET_LENGTH(to));
1333 return -1;
1334 }
1335
1336 if (how_many == 0)
1337 return 0;
1338
Victor Stinner488fa492011-12-12 00:01:39 +01001339 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340 return -1;
1341
1342 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1343 if (err) {
1344 PyErr_Format(PyExc_SystemError,
1345 "Cannot copy %s characters "
1346 "into a string of %s characters",
1347 unicode_kind_name(from),
1348 unicode_kind_name(to));
1349 return -1;
1350 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001351 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352}
1353
Victor Stinner17222162011-09-28 22:15:37 +02001354/* Find the maximum code point and count the number of surrogate pairs so a
1355 correct string length can be computed before converting a string to UCS4.
1356 This function counts single surrogates as a character and not as a pair.
1357
1358 Return 0 on success, or -1 on error. */
1359static int
1360find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1361 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362{
1363 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001364 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365
Victor Stinnerc53be962011-10-02 21:33:54 +02001366 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 *num_surrogates = 0;
1368 *maxchar = 0;
1369
1370 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001372 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1373 && (iter+1) < end
1374 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001376 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 iter += 2;
1379 }
1380 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001382 {
1383 ch = *iter;
1384 iter++;
1385 }
1386 if (ch > *maxchar) {
1387 *maxchar = ch;
1388 if (*maxchar > MAX_UNICODE) {
1389 PyErr_Format(PyExc_ValueError,
1390 "character U+%x is not in range [U+0000; U+10ffff]",
1391 ch);
1392 return -1;
1393 }
1394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 return 0;
1397}
1398
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001399int
1400_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401{
1402 wchar_t *end;
1403 Py_UCS4 maxchar = 0;
1404 Py_ssize_t num_surrogates;
1405#if SIZEOF_WCHAR_T == 2
1406 Py_ssize_t length_wo_surrogates;
1407#endif
1408
Georg Brandl7597add2011-10-05 16:36:47 +02001409 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 strings were created using _PyObject_New() and where no canonical
1411 representation (the str field) has been set yet aka strings
1412 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001413 assert(_PyUnicode_CHECK(unicode));
1414 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001416 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001418 /* Actually, it should neither be interned nor be anything else: */
1419 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001422 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001423 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425
1426 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1428 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 PyErr_NoMemory();
1430 return -1;
1431 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001432 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 _PyUnicode_WSTR(unicode), end,
1434 PyUnicode_1BYTE_DATA(unicode));
1435 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1436 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1437 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1438 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001439 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001441 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 }
1443 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001444 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001445 _PyUnicode_UTF8(unicode) = NULL;
1446 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 }
1448 PyObject_FREE(_PyUnicode_WSTR(unicode));
1449 _PyUnicode_WSTR(unicode) = NULL;
1450 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451 }
1452 /* In this case we might have to convert down from 4-byte native
1453 wchar_t to 2-byte unicode. */
1454 else if (maxchar < 65536) {
1455 assert(num_surrogates == 0 &&
1456 "FindMaxCharAndNumSurrogatePairs() messed up");
1457
Victor Stinner506f5922011-09-28 22:34:18 +02001458#if SIZEOF_WCHAR_T == 2
1459 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1462 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1463 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001466#else
1467 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001469 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001471 PyErr_NoMemory();
1472 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 }
Victor Stinner506f5922011-09-28 22:34:18 +02001474 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1475 _PyUnicode_WSTR(unicode), end,
1476 PyUnicode_2BYTE_DATA(unicode));
1477 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1478 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1479 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8(unicode) = NULL;
1481 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001482 PyObject_FREE(_PyUnicode_WSTR(unicode));
1483 _PyUnicode_WSTR(unicode) = NULL;
1484 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1485#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 }
1487 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1488 else {
1489#if SIZEOF_WCHAR_T == 2
1490 /* in case the native representation is 2-bytes, we need to allocate a
1491 new normalized 4-byte version. */
1492 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001493 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1494 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyErr_NoMemory();
1496 return -1;
1497 }
1498 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1499 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 _PyUnicode_UTF8(unicode) = NULL;
1501 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001502 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1503 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001504 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 PyObject_FREE(_PyUnicode_WSTR(unicode));
1506 _PyUnicode_WSTR(unicode) = NULL;
1507 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1508#else
1509 assert(num_surrogates == 0);
1510
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001513 _PyUnicode_UTF8(unicode) = NULL;
1514 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1516#endif
1517 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1518 }
1519 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001520 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 return 0;
1522}
1523
Alexander Belopolsky40018472011-02-26 01:02:56 +00001524static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001525unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526{
Walter Dörwald16807132007-05-25 13:52:07 +00001527 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 case SSTATE_NOT_INTERNED:
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_MORTAL:
1532 /* revive dead object temporarily for DelItem */
1533 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001534 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 Py_FatalError(
1536 "deletion of interned string failed");
1537 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001538
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 case SSTATE_INTERNED_IMMORTAL:
1540 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001541
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 default:
1543 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001544 }
1545
Victor Stinner03490912011-10-03 23:45:12 +02001546 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001548 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001549 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001550 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1551 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001553 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554}
1555
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character latin1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       a latin1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return first < 256 && unicode_latin1[first] == unicode;
}
#endif
1572
Alexander Belopolsky40018472011-02-26 01:02:56 +00001573static int
Victor Stinner488fa492011-12-12 00:01:39 +01001574unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575{
Victor Stinner488fa492011-12-12 00:01:39 +01001576 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 if (Py_REFCNT(unicode) != 1)
1578 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001579 if (_PyUnicode_HASH(unicode) != -1)
1580 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_CHECK_INTERNED(unicode))
1582 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001583 if (!PyUnicode_CheckExact(unicode))
1584 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001585#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001586 /* singleton refcount is greater than 1 */
1587 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001588#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 return 1;
1590}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592static int
1593unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1594{
1595 PyObject *unicode;
1596 Py_ssize_t old_length;
1597
1598 assert(p_unicode != NULL);
1599 unicode = *p_unicode;
1600
1601 assert(unicode != NULL);
1602 assert(PyUnicode_Check(unicode));
1603 assert(0 <= length);
1604
Victor Stinner910337b2011-10-03 03:20:16 +02001605 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 old_length = PyUnicode_WSTR_LENGTH(unicode);
1607 else
1608 old_length = PyUnicode_GET_LENGTH(unicode);
1609 if (old_length == length)
1610 return 0;
1611
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001612 if (length == 0) {
1613 Py_DECREF(*p_unicode);
1614 *p_unicode = unicode_empty;
1615 Py_INCREF(*p_unicode);
1616 return 0;
1617 }
1618
Victor Stinner488fa492011-12-12 00:01:39 +01001619 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 PyObject *copy = resize_copy(unicode, length);
1621 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 Py_DECREF(*p_unicode);
1624 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001625 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626 }
1627
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 PyObject *new_unicode = resize_compact(unicode, length);
1630 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001632 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001634 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001635 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636}
1637
Alexander Belopolsky40018472011-02-26 01:02:56 +00001638int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001640{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 PyObject *unicode;
1642 if (p_unicode == NULL) {
1643 PyErr_BadInternalCall();
1644 return -1;
1645 }
1646 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 {
1649 PyErr_BadInternalCall();
1650 return -1;
1651 }
1652 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001653}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001656unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1657 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658{
1659 PyObject *result;
1660 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001661 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001662 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1663 return 0;
1664 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1665 maxchar);
1666 if (result == NULL)
1667 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001668 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001669 Py_DECREF(*p_unicode);
1670 *p_unicode = result;
1671 return 0;
1672}
1673
1674static int
1675unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1676 Py_UCS4 ch)
1677{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001678 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001679 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001680 return -1;
1681 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1682 PyUnicode_DATA(*p_unicode),
1683 (*pos)++, ch);
1684 return 0;
1685}
1686
Victor Stinnerc5166102012-02-22 13:55:02 +01001687/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001688
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001689 WARNING: The function doesn't copy the terminating null character and
1690 doesn't check the maximum character (may write a latin1 character in an
1691 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001692static void
1693unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1694 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001695{
1696 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1697 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001698 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001699
1700 switch (kind) {
1701 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001703#ifdef Py_DEBUG
1704 if (PyUnicode_IS_ASCII(unicode)) {
1705 Py_UCS4 maxchar = ucs1lib_find_max_char(
1706 (const Py_UCS1*)str,
1707 (const Py_UCS1*)str + len);
1708 assert(maxchar < 128);
1709 }
1710#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001711 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 case PyUnicode_2BYTE_KIND: {
1715 Py_UCS2 *start = (Py_UCS2 *)data + index;
1716 Py_UCS2 *ucs2 = start;
1717 assert(index <= PyUnicode_GET_LENGTH(unicode));
1718
Victor Stinner184252a2012-06-16 02:57:41 +02001719 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 *ucs2 = (Py_UCS2)*str;
1721
1722 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001723 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 default: {
1726 Py_UCS4 *start = (Py_UCS4 *)data + index;
1727 Py_UCS4 *ucs4 = start;
1728 assert(kind == PyUnicode_4BYTE_KIND);
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs4 = (Py_UCS4)*str;
1733
1734 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 }
1736 }
1737}
1738
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740static PyObject*
1741get_latin1_char(unsigned char ch)
1742{
Victor Stinnera464fc12011-10-02 20:39:30 +02001743 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001745 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 if (!unicode)
1747 return NULL;
1748 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001749 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 unicode_latin1[ch] = unicode;
1751 }
1752 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Alexander Belopolsky40018472011-02-26 01:02:56 +00001756PyObject *
1757PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001759 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 Py_UCS4 maxchar = 0;
1761 Py_ssize_t num_surrogates;
1762
1763 if (u == NULL)
1764 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001766 /* If the Unicode data is known at construction time, we can apply
1767 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 /* Optimization for empty strings */
1770 if (size == 0 && unicode_empty != NULL) {
1771 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001772 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001773 }
Tim Petersced69f82003-09-16 20:30:58 +00001774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 /* Single character Unicode objects in the Latin-1 range are
1776 shared when using this constructor */
1777 if (size == 1 && *u < 256)
1778 return get_latin1_char((unsigned char)*u);
1779
1780 /* If not empty and not single character, copy the Unicode data
1781 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001782 if (find_maxchar_surrogates(u, u + size,
1783 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return NULL;
1785
Victor Stinner8faf8212011-12-08 22:14:11 +01001786 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 if (!unicode)
1788 return NULL;
1789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 switch (PyUnicode_KIND(unicode)) {
1791 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001792 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1794 break;
1795 case PyUnicode_2BYTE_KIND:
1796#if Py_UNICODE_SIZE == 2
1797 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1798#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001799 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1801#endif
1802 break;
1803 case PyUnicode_4BYTE_KIND:
1804#if SIZEOF_WCHAR_T == 2
1805 /* This is the only case which has to process surrogates, thus
1806 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001807 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808#else
1809 assert(num_surrogates == 0);
1810 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1811#endif
1812 break;
1813 default:
1814 assert(0 && "Impossible state");
1815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001817 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818}
1819
Alexander Belopolsky40018472011-02-26 01:02:56 +00001820PyObject *
1821PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 if (size < 0) {
1824 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001825 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 return NULL;
1827 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001828 if (u != NULL)
1829 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1830 else
1831 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832}
1833
Alexander Belopolsky40018472011-02-26 01:02:56 +00001834PyObject *
1835PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001836{
1837 size_t size = strlen(u);
1838 if (size > PY_SSIZE_T_MAX) {
1839 PyErr_SetString(PyExc_OverflowError, "input too long");
1840 return NULL;
1841 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001842 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001843}
1844
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001845PyObject *
1846_PyUnicode_FromId(_Py_Identifier *id)
1847{
1848 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001849 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1850 strlen(id->string),
1851 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001852 if (!id->object)
1853 return NULL;
1854 PyUnicode_InternInPlace(&id->object);
1855 assert(!id->next);
1856 id->next = static_strings;
1857 static_strings = id;
1858 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 return id->object;
1860}
1861
1862void
1863_PyUnicode_ClearStaticStrings()
1864{
1865 _Py_Identifier *i;
1866 for (i = static_strings; i; i = i->next) {
1867 Py_DECREF(i->object);
1868 i->object = NULL;
1869 i->next = NULL;
1870 }
1871}
1872
Benjamin Peterson0df54292012-03-26 14:50:32 -04001873/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874
Victor Stinnerd3f08822012-05-29 12:57:52 +02001875PyObject*
1876_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001877{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001878 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001879 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001880 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001881#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001882 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001883#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001884 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001885 }
Victor Stinner785938e2011-12-11 20:09:03 +01001886 unicode = PyUnicode_New(size, 127);
1887 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001888 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001889 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1890 assert(_PyUnicode_CheckConsistency(unicode, 1));
1891 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001892}
1893
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001894static Py_UCS4
1895kind_maxchar_limit(unsigned int kind)
1896{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001897 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001898 case PyUnicode_1BYTE_KIND:
1899 return 0x80;
1900 case PyUnicode_2BYTE_KIND:
1901 return 0x100;
1902 case PyUnicode_4BYTE_KIND:
1903 return 0x10000;
1904 default:
1905 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001906 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 }
1908}
1909
Victor Stinnere6abb482012-05-02 01:15:40 +02001910Py_LOCAL_INLINE(Py_UCS4)
1911align_maxchar(Py_UCS4 maxchar)
1912{
1913 if (maxchar <= 127)
1914 return 127;
1915 else if (maxchar <= 255)
1916 return 255;
1917 else if (maxchar <= 65535)
1918 return 65535;
1919 else
1920 return MAX_UNICODE;
1921}
1922
Victor Stinner702c7342011-10-05 13:50:52 +02001923static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001924_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001928
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929 if (size == 0) {
1930 Py_INCREF(unicode_empty);
1931 return unicode_empty;
1932 }
1933 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001934 if (size == 1)
1935 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (!res)
1940 return NULL;
1941 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001942 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001944}
1945
Victor Stinnere57b1c02011-09-28 22:20:48 +02001946static PyObject*
1947_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948{
1949 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001952 if (size == 0) {
1953 Py_INCREF(unicode_empty);
1954 return unicode_empty;
1955 }
1956 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001957 if (size == 1) {
1958 Py_UCS4 ch = u[0];
1959 if (ch < 256)
1960 return get_latin1_char((unsigned char)ch);
1961
1962 res = PyUnicode_New(1, ch);
1963 if (res == NULL)
1964 return NULL;
1965 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1966 assert(_PyUnicode_CheckConsistency(res, 1));
1967 return res;
1968 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001970 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 if (!res)
1973 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001974 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 else {
1977 _PyUnicode_CONVERT_BYTES(
1978 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 if (size == 0) {
1991 Py_INCREF(unicode_empty);
1992 return unicode_empty;
1993 }
1994 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001995 if (size == 1) {
1996 Py_UCS4 ch = u[0];
1997 if (ch < 256)
1998 return get_latin1_char((unsigned char)ch);
1999
2000 res = PyUnicode_New(1, ch);
2001 if (res == NULL)
2002 return NULL;
2003 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
2004 assert(_PyUnicode_CheckConsistency(res, 1));
2005 return res;
2006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002007
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002008 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002009 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 if (!res)
2011 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002012 if (max_char < 256)
2013 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2014 PyUnicode_1BYTE_DATA(res));
2015 else if (max_char < 0x10000)
2016 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2017 PyUnicode_2BYTE_DATA(res));
2018 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002020 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 return res;
2022}
2023
2024PyObject*
2025PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2026{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002027 if (size < 0) {
2028 PyErr_SetString(PyExc_ValueError, "size must be positive");
2029 return NULL;
2030 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002031 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002033 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002035 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002037 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002038 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002039 PyErr_SetString(PyExc_SystemError, "invalid kind");
2040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042}
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044Py_UCS4
2045_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2046{
2047 enum PyUnicode_Kind kind;
2048 void *startptr, *endptr;
2049
2050 assert(PyUnicode_IS_READY(unicode));
2051 assert(0 <= start);
2052 assert(end <= PyUnicode_GET_LENGTH(unicode));
2053 assert(start <= end);
2054
2055 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2056 return PyUnicode_MAX_CHAR_VALUE(unicode);
2057
2058 if (start == end)
2059 return 127;
2060
Victor Stinner94d558b2012-04-27 22:26:58 +02002061 if (PyUnicode_IS_ASCII(unicode))
2062 return 127;
2063
Victor Stinnerece58de2012-04-23 23:36:38 +02002064 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002065 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002066 endptr = (char *)startptr + end * kind;
2067 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002068 switch(kind) {
2069 case PyUnicode_1BYTE_KIND:
2070 return ucs1lib_find_max_char(startptr, endptr);
2071 case PyUnicode_2BYTE_KIND:
2072 return ucs2lib_find_max_char(startptr, endptr);
2073 case PyUnicode_4BYTE_KIND:
2074 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002076 assert(0);
2077 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 }
2079}
2080
Victor Stinner25a4b292011-10-06 12:31:55 +02002081/* Ensure that a string uses the most efficient storage, if it is not the
2082 case: create a new string with of the right kind. Write NULL into *p_unicode
2083 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002084static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002085unicode_adjust_maxchar(PyObject **p_unicode)
2086{
2087 PyObject *unicode, *copy;
2088 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002089 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 unsigned int kind;
2091
2092 assert(p_unicode != NULL);
2093 unicode = *p_unicode;
2094 assert(PyUnicode_IS_READY(unicode));
2095 if (PyUnicode_IS_ASCII(unicode))
2096 return;
2097
2098 len = PyUnicode_GET_LENGTH(unicode);
2099 kind = PyUnicode_KIND(unicode);
2100 if (kind == PyUnicode_1BYTE_KIND) {
2101 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002102 max_char = ucs1lib_find_max_char(u, u + len);
2103 if (max_char >= 128)
2104 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 }
2106 else if (kind == PyUnicode_2BYTE_KIND) {
2107 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002108 max_char = ucs2lib_find_max_char(u, u + len);
2109 if (max_char >= 256)
2110 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 }
2112 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002114 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002115 max_char = ucs4lib_find_max_char(u, u + len);
2116 if (max_char >= 0x10000)
2117 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002118 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002120 if (copy != NULL)
2121 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 Py_DECREF(unicode);
2123 *p_unicode = copy;
2124}
2125
Victor Stinner034f6cf2011-09-30 02:26:44 +02002126PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002127_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002128{
Victor Stinner87af4f22011-11-21 23:03:47 +01002129 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002130 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002131
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132 if (!PyUnicode_Check(unicode)) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002136 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002138
Victor Stinner87af4f22011-11-21 23:03:47 +01002139 length = PyUnicode_GET_LENGTH(unicode);
2140 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 if (!copy)
2142 return NULL;
2143 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2144
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2146 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002147 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149}
2150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152/* Widen Unicode objects to larger buffers. Don't write terminating null
2153 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154
2155void*
2156_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2157{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 Py_ssize_t len;
2159 void *result;
2160 unsigned int skind;
2161
Benjamin Petersonbac79492012-01-14 13:34:47 -05002162 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 return NULL;
2164
2165 len = PyUnicode_GET_LENGTH(s);
2166 skind = PyUnicode_KIND(s);
2167 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002168 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 return NULL;
2170 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002171 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 case PyUnicode_2BYTE_KIND:
2173 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2174 if (!result)
2175 return PyErr_NoMemory();
2176 assert(skind == PyUnicode_1BYTE_KIND);
2177 _PyUnicode_CONVERT_BYTES(
2178 Py_UCS1, Py_UCS2,
2179 PyUnicode_1BYTE_DATA(s),
2180 PyUnicode_1BYTE_DATA(s) + len,
2181 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_4BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 if (skind == PyUnicode_2BYTE_KIND) {
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS2, Py_UCS4,
2190 PyUnicode_2BYTE_DATA(s),
2191 PyUnicode_2BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 else {
2195 assert(skind == PyUnicode_1BYTE_KIND);
2196 _PyUnicode_CONVERT_BYTES(
2197 Py_UCS1, Py_UCS4,
2198 PyUnicode_1BYTE_DATA(s),
2199 PyUnicode_1BYTE_DATA(s) + len,
2200 result);
2201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 default:
2204 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 }
Victor Stinner01698042011-10-04 00:04:26 +02002206 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 return NULL;
2208}
2209
2210static Py_UCS4*
2211as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2212 int copy_null)
2213{
2214 int kind;
2215 void *data;
2216 Py_ssize_t len, targetlen;
2217 if (PyUnicode_READY(string) == -1)
2218 return NULL;
2219 kind = PyUnicode_KIND(string);
2220 data = PyUnicode_DATA(string);
2221 len = PyUnicode_GET_LENGTH(string);
2222 targetlen = len;
2223 if (copy_null)
2224 targetlen++;
2225 if (!target) {
2226 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2227 PyErr_NoMemory();
2228 return NULL;
2229 }
2230 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2231 if (!target) {
2232 PyErr_NoMemory();
2233 return NULL;
2234 }
2235 }
2236 else {
2237 if (targetsize < targetlen) {
2238 PyErr_Format(PyExc_SystemError,
2239 "string is longer than the buffer");
2240 if (copy_null && 0 < targetsize)
2241 target[0] = 0;
2242 return NULL;
2243 }
2244 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002245 if (kind == PyUnicode_1BYTE_KIND) {
2246 Py_UCS1 *start = (Py_UCS1 *) data;
2247 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002249 else if (kind == PyUnicode_2BYTE_KIND) {
2250 Py_UCS2 *start = (Py_UCS2 *) data;
2251 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2252 }
2253 else {
2254 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 if (copy_null)
2258 target[len] = 0;
2259 return target;
2260}
2261
2262Py_UCS4*
2263PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2264 int copy_null)
2265{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002266 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 PyErr_BadInternalCall();
2268 return NULL;
2269 }
2270 return as_ucs4(string, target, targetsize, copy_null);
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4Copy(PyObject *string)
2275{
2276 return as_ucs4(string, NULL, 0, 1);
2277}
2278
2279#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002280
Alexander Belopolsky40018472011-02-26 01:02:56 +00002281PyObject *
2282PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002285 if (size == 0) {
2286 Py_INCREF(unicode_empty);
2287 return unicode_empty;
2288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 PyErr_BadInternalCall();
2290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 }
2292
Martin v. Löwis790465f2008-04-05 20:41:37 +00002293 if (size == -1) {
2294 size = wcslen(w);
2295 }
2296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298}
2299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301
Walter Dörwald346737f2007-05-31 10:44:43 +00002302static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002303makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002304 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 if (longflag)
2308 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002309 else if (longlongflag) {
2310 /* longlongflag should only ever be nonzero on machines with
2311 HAVE_LONG_LONG defined */
2312#ifdef HAVE_LONG_LONG
2313 char *f = PY_FORMAT_LONG_LONG;
2314 while (*f)
2315 *fmt++ = *f++;
2316#else
2317 /* we shouldn't ever get here */
2318 assert(0);
2319 *fmt++ = 'l';
2320#endif
2321 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002322 else if (size_tflag) {
2323 char *f = PY_FORMAT_SIZE_T;
2324 while (*f)
2325 *fmt++ = *f++;
2326 }
2327 *fmt++ = c;
2328 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002329}
2330
/* Size of the character buffers used to format a %lld or %p argument.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits.  53/22 is an upper bound for log10(256), and
   (SIZEOF_LONG_LONG*53 - 1) / 22 + 1 is the integer form of
   ceil(SIZEOF_LONG_LONG * 53 / 22); the remaining +1 is for the sign.

   NOTE(review): no slot is explicitly reserved for the terminating null
   byte written by sprintf(); presumably the slack in the 53/22 estimate
   covers it -- confirm. */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG) * 53 - 1) / 22 + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002335
2336static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002337unicode_fromformat_arg(_PyUnicodeWriter *writer,
2338 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002339{
Victor Stinnere215d962012-10-06 23:03:36 +02002340 const char *p;
2341 Py_ssize_t len;
2342 int zeropad;
2343 int width;
2344 int precision;
2345 int longflag;
2346 int longlongflag;
2347 int size_tflag;
2348 int fill;
2349
2350 p = f;
2351 f++;
2352 zeropad = (*f == '0');
Victor Stinner96865452011-03-01 23:44:09 +00002353
2354 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002355 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002356 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002357 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2358 PyErr_SetString(PyExc_ValueError,
2359 "width too big");
2360 return NULL;
2361 }
Victor Stinnere215d962012-10-06 23:03:36 +02002362 width = (width*10) + (*f - '0');
2363 f++;
2364 }
Victor Stinner96865452011-03-01 23:44:09 +00002365 precision = 0;
2366 if (*f == '.') {
2367 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002368 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002369 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2370 PyErr_SetString(PyExc_ValueError,
2371 "precision too big");
2372 return NULL;
2373 }
Victor Stinnere215d962012-10-06 23:03:36 +02002374 precision = (precision*10) + (*f - '0');
2375 f++;
2376 }
Victor Stinner96865452011-03-01 23:44:09 +00002377 if (*f == '%') {
2378 /* "%.3%s" => f points to "3" */
2379 f--;
2380 }
2381 }
2382 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002383 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002384 f--;
2385 }
Victor Stinner96865452011-03-01 23:44:09 +00002386
2387 /* Handle %ld, %lu, %lld and %llu. */
2388 longflag = 0;
2389 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002390 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002391 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002392 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002393 longflag = 1;
2394 ++f;
2395 }
2396#ifdef HAVE_LONG_LONG
2397 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002398 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002399 longlongflag = 1;
2400 f += 2;
2401 }
2402#endif
2403 }
2404 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002405 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002406 size_tflag = 1;
2407 ++f;
2408 }
Victor Stinnere215d962012-10-06 23:03:36 +02002409
2410 if (f[1] == '\0')
2411 writer->overallocate = 0;
2412
2413 switch (*f) {
2414 case 'c':
2415 {
2416 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002417 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2418 PyErr_SetString(PyExc_ValueError,
2419 "character argument not in range(0x110000)");
2420 return NULL;
2421 }
Victor Stinnere215d962012-10-06 23:03:36 +02002422 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2423 return NULL;
2424 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2425 writer->pos++;
2426 break;
2427 }
2428
2429 case 'i':
2430 case 'd':
2431 case 'u':
2432 case 'x':
2433 {
2434 /* used by sprintf */
2435 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002436 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002437
2438 if (*f == 'u') {
2439 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2440
2441 if (longflag)
2442 len = sprintf(buffer, fmt,
2443 va_arg(*vargs, unsigned long));
2444#ifdef HAVE_LONG_LONG
2445 else if (longlongflag)
2446 len = sprintf(buffer, fmt,
2447 va_arg(*vargs, unsigned PY_LONG_LONG));
2448#endif
2449 else if (size_tflag)
2450 len = sprintf(buffer, fmt,
2451 va_arg(*vargs, size_t));
2452 else
2453 len = sprintf(buffer, fmt,
2454 va_arg(*vargs, unsigned int));
2455 }
2456 else if (*f == 'x') {
2457 makefmt(fmt, 0, 0, 0, 'x');
2458 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2459 }
2460 else {
2461 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2462
2463 if (longflag)
2464 len = sprintf(buffer, fmt,
2465 va_arg(*vargs, long));
2466#ifdef HAVE_LONG_LONG
2467 else if (longlongflag)
2468 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, PY_LONG_LONG));
2470#endif
2471 else if (size_tflag)
2472 len = sprintf(buffer, fmt,
2473 va_arg(*vargs, Py_ssize_t));
2474 else
2475 len = sprintf(buffer, fmt,
2476 va_arg(*vargs, int));
2477 }
2478 assert(len >= 0);
2479
Victor Stinnere215d962012-10-06 23:03:36 +02002480 if (precision < len)
2481 precision = len;
2482 if (width > precision) {
2483 Py_UCS4 fillchar;
2484 fill = width - precision;
2485 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002486 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2487 return NULL;
2488 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2489 return NULL;
2490 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002491 }
Victor Stinner15a11362012-10-06 23:48:20 +02002492 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002493 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002494 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2495 return NULL;
2496 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2497 return NULL;
2498 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002499 }
Victor Stinner15a11362012-10-06 23:48:20 +02002500 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002501 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002502 break;
2503 }
2504
2505 case 'p':
2506 {
2507 char number[MAX_LONG_LONG_CHARS];
2508
2509 len = sprintf(number, "%p", va_arg(*vargs, void*));
2510 assert(len >= 0);
2511
2512 /* %p is ill-defined: ensure leading 0x. */
2513 if (number[1] == 'X')
2514 number[1] = 'x';
2515 else if (number[1] != 'x') {
2516 memmove(number + 2, number,
2517 strlen(number) + 1);
2518 number[0] = '0';
2519 number[1] = 'x';
2520 len += 2;
2521 }
2522
2523 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2524 return NULL;
2525 break;
2526 }
2527
2528 case 's':
2529 {
2530 /* UTF-8 */
2531 const char *s = va_arg(*vargs, const char*);
2532 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2533 if (!str)
2534 return NULL;
2535 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2536 Py_DECREF(str);
2537 return NULL;
2538 }
2539 Py_DECREF(str);
2540 break;
2541 }
2542
2543 case 'U':
2544 {
2545 PyObject *obj = va_arg(*vargs, PyObject *);
2546 assert(obj && _PyUnicode_CHECK(obj));
2547
2548 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2549 return NULL;
2550 break;
2551 }
2552
2553 case 'V':
2554 {
2555 PyObject *obj = va_arg(*vargs, PyObject *);
2556 const char *str = va_arg(*vargs, const char *);
2557 PyObject *str_obj;
2558 assert(obj || str);
2559 if (obj) {
2560 assert(_PyUnicode_CHECK(obj));
2561 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2562 return NULL;
2563 }
2564 else {
2565 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2566 if (!str_obj)
2567 return NULL;
2568 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2569 Py_DECREF(str_obj);
2570 return NULL;
2571 }
2572 Py_DECREF(str_obj);
2573 }
2574 break;
2575 }
2576
2577 case 'S':
2578 {
2579 PyObject *obj = va_arg(*vargs, PyObject *);
2580 PyObject *str;
2581 assert(obj);
2582 str = PyObject_Str(obj);
2583 if (!str)
2584 return NULL;
2585 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2586 Py_DECREF(str);
2587 return NULL;
2588 }
2589 Py_DECREF(str);
2590 break;
2591 }
2592
2593 case 'R':
2594 {
2595 PyObject *obj = va_arg(*vargs, PyObject *);
2596 PyObject *repr;
2597 assert(obj);
2598 repr = PyObject_Repr(obj);
2599 if (!repr)
2600 return NULL;
2601 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2602 Py_DECREF(repr);
2603 return NULL;
2604 }
2605 Py_DECREF(repr);
2606 break;
2607 }
2608
2609 case 'A':
2610 {
2611 PyObject *obj = va_arg(*vargs, PyObject *);
2612 PyObject *ascii;
2613 assert(obj);
2614 ascii = PyObject_ASCII(obj);
2615 if (!ascii)
2616 return NULL;
2617 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2618 Py_DECREF(ascii);
2619 return NULL;
2620 }
2621 Py_DECREF(ascii);
2622 break;
2623 }
2624
2625 case '%':
2626 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2627 return NULL;
2628 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2629 writer->pos++;
2630 break;
2631
2632 default:
2633 /* if we stumble upon an unknown formatting code, copy the rest
2634 of the format string to the output string. (we cannot just
2635 skip the code, since there's no way to know what's in the
2636 argument list) */
2637 len = strlen(p);
2638 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2639 return NULL;
2640 f = p+len;
2641 return f;
2642 }
2643
2644 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002645 return f;
2646}
2647
Walter Dörwaldd2034312007-05-18 16:29:38 +00002648PyObject *
2649PyUnicode_FromFormatV(const char *format, va_list vargs)
2650{
Victor Stinnere215d962012-10-06 23:03:36 +02002651 va_list vargs2;
2652 const char *f;
2653 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002654
Victor Stinnere215d962012-10-06 23:03:36 +02002655 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2656
2657 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2658 Copy it to be able to pass a reference to a subfunction. */
2659 Py_VA_COPY(vargs2, vargs);
2660
2661 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002663 f = unicode_fromformat_arg(&writer, f, &vargs2);
2664 if (f == NULL)
2665 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002667 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002668 const char *p;
2669 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002670
Victor Stinnere215d962012-10-06 23:03:36 +02002671 p = f;
2672 do
2673 {
2674 if ((unsigned char)*p > 127) {
2675 PyErr_Format(PyExc_ValueError,
2676 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2677 "string, got a non-ASCII byte: 0x%02x",
2678 (unsigned char)*p);
2679 return NULL;
2680 }
2681 p++;
2682 }
2683 while (*p != '\0' && *p != '%');
2684 len = p - f;
2685
2686 if (*p == '\0')
2687 writer.overallocate = 0;
2688 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2689 goto fail;
2690 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2691 writer.pos += len;
2692
2693 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002695 }
Victor Stinnere215d962012-10-06 23:03:36 +02002696 return _PyUnicodeWriter_Finish(&writer);
2697
2698 fail:
2699 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002701}
2702
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703PyObject *
2704PyUnicode_FromFormat(const char *format, ...)
2705{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002706 PyObject* ret;
2707 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
2709#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 ret = PyUnicode_FromFormatV(format, vargs);
2715 va_end(vargs);
2716 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717}
2718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719#ifdef HAVE_WCHAR_H
2720
Victor Stinner5593d8a2010-10-02 11:11:27 +00002721/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2722 convert a Unicode object to a wide character string.
2723
Victor Stinnerd88d9832011-09-06 02:00:05 +02002724 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725 character) required to convert the unicode object. Ignore size argument.
2726
Victor Stinnerd88d9832011-09-06 02:00:05 +02002727 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002728 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002731unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 wchar_t *w,
2733 Py_ssize_t size)
2734{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002735 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 const wchar_t *wstr;
2737
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002738 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 if (wstr == NULL)
2740 return -1;
2741
Victor Stinner5593d8a2010-10-02 11:11:27 +00002742 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002743 if (size > res)
2744 size = res + 1;
2745 else
2746 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748 return res;
2749 }
2750 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002752}
2753
2754Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002755PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002756 wchar_t *w,
2757 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758{
2759 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002760 PyErr_BadInternalCall();
2761 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002763 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764}
2765
Victor Stinner137c34c2010-09-29 10:25:54 +00002766wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002767PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002768 Py_ssize_t *size)
2769{
2770 wchar_t* buffer;
2771 Py_ssize_t buflen;
2772
2773 if (unicode == NULL) {
2774 PyErr_BadInternalCall();
2775 return NULL;
2776 }
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (buflen == -1)
2780 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002782 PyErr_NoMemory();
2783 return NULL;
2784 }
2785
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2787 if (buffer == NULL) {
2788 PyErr_NoMemory();
2789 return NULL;
2790 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002791 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002792 if (buflen == -1) {
2793 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002795 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 if (size != NULL)
2797 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002798 return buffer;
2799}
2800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802
Alexander Belopolsky40018472011-02-26 01:02:56 +00002803PyObject *
2804PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002807 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 PyErr_SetString(PyExc_ValueError,
2809 "chr() arg not in range(0x110000)");
2810 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (ordinal < 256)
2814 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 v = PyUnicode_New(1, ordinal);
2817 if (v == NULL)
2818 return NULL;
2819 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002820 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002822}
2823
Alexander Belopolsky40018472011-02-26 01:02:56 +00002824PyObject *
2825PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002830 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002831 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 Py_INCREF(obj);
2833 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002834 }
2835 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 /* For a Unicode subtype that's not a Unicode object,
2837 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002838 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002839 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 PyErr_Format(PyExc_TypeError,
2841 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002842 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002843 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844}
2845
Alexander Belopolsky40018472011-02-26 01:02:56 +00002846PyObject *
2847PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002848 const char *encoding,
2849 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002851 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 PyErr_BadInternalCall();
2856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 /* Decoding bytes objects is the most common case and should be fast */
2860 if (PyBytes_Check(obj)) {
2861 if (PyBytes_GET_SIZE(obj) == 0) {
2862 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002863 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002864 }
2865 else {
2866 v = PyUnicode_Decode(
2867 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2868 encoding, errors);
2869 }
2870 return v;
2871 }
2872
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002873 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 PyErr_SetString(PyExc_TypeError,
2875 "decoding str is not supported");
2876 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2880 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2881 PyErr_Format(PyExc_TypeError,
2882 "coercing to str: need bytes, bytearray "
2883 "or buffer-like object, %.80s found",
2884 Py_TYPE(obj)->tp_name);
2885 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002886 }
Tim Petersced69f82003-09-16 20:30:58 +00002887
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002888 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002890 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Tim Petersced69f82003-09-16 20:30:58 +00002892 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002896 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897}
2898
/* Normalize an encoding name into the caller-supplied buffer `lower`, whose
   capacity is `lower_len` bytes including the terminating NUL: upper-case
   letters are converted to lower case and '_' is replaced with '-', in
   order to catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   Return 1 on success.  Return 0 on error, i.e. when the normalized name
   plus its terminating NUL does not fit into lower_len bytes; in that case
   the content of `lower` must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without room for at least the NUL terminator, lower_len - 1 below
       would wrap around and defeat the bounds check. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2935
Alexander Belopolsky40018472011-02-26 01:02:56 +00002936PyObject *
2937PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002938 Py_ssize_t size,
2939 const char *encoding,
2940 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002941{
2942 PyObject *buffer = NULL, *unicode;
2943 Py_buffer info;
2944 char lower[11]; /* Enough for any encoding shortcut */
2945
Fred Drakee4315f52000-05-09 19:53:39 +00002946 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002947 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002948 if ((strcmp(lower, "utf-8") == 0) ||
2949 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002950 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002951 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002952 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002953 (strcmp(lower, "iso-8859-1") == 0))
2954 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002955#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "mbcs") == 0)
2957 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002958#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002959 else if (strcmp(lower, "ascii") == 0)
2960 return PyUnicode_DecodeASCII(s, size, errors);
2961 else if (strcmp(lower, "utf-16") == 0)
2962 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2963 else if (strcmp(lower, "utf-32") == 0)
2964 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
2967 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002969 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002971 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 if (buffer == NULL)
2973 goto onError;
2974 unicode = PyCodec_Decode(buffer, encoding, errors);
2975 if (unicode == NULL)
2976 goto onError;
2977 if (!PyUnicode_Check(unicode)) {
2978 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002979 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002980 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 Py_DECREF(unicode);
2982 goto onError;
2983 }
2984 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002985 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002986
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 Py_XDECREF(buffer);
2989 return NULL;
2990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 const char *encoding,
2995 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002996{
2997 PyObject *v;
2998
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 goto onError;
3002 }
3003
3004 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003006
3007 /* Decode via the codec registry */
3008 v = PyCodec_Decode(unicode, encoding, errors);
3009 if (v == NULL)
3010 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003011 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014 return NULL;
3015}
3016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003019 const char *encoding,
3020 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003021{
3022 PyObject *v;
3023
3024 if (!PyUnicode_Check(unicode)) {
3025 PyErr_BadArgument();
3026 goto onError;
3027 }
3028
3029 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003031
3032 /* Decode via the codec registry */
3033 v = PyCodec_Decode(unicode, encoding, errors);
3034 if (v == NULL)
3035 goto onError;
3036 if (!PyUnicode_Check(v)) {
3037 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003038 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003039 Py_TYPE(v)->tp_name);
3040 Py_DECREF(v);
3041 goto onError;
3042 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003043 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046 return NULL;
3047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
3050PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 Py_ssize_t size,
3052 const char *encoding,
3053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054{
3055 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 unicode = PyUnicode_FromUnicode(s, size);
3058 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3061 Py_DECREF(unicode);
3062 return v;
3063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
3066PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003067 const char *encoding,
3068 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003069{
3070 PyObject *v;
3071
3072 if (!PyUnicode_Check(unicode)) {
3073 PyErr_BadArgument();
3074 goto onError;
3075 }
3076
3077 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079
3080 /* Encode via the codec registry */
3081 v = PyCodec_Encode(unicode, encoding, errors);
3082 if (v == NULL)
3083 goto onError;
3084 return v;
3085
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087 return NULL;
3088}
3089
/* Locate the first character of the null-terminated wide string `wstr`
   that wcstombs() refuses to convert, by converting the string one
   character at a time.

   Return the index (in wchar_t units) of that character.  Return 0 when
   every character converts, which callers cannot tell apart from a failure
   at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Holds a single character plus the terminating null.  With a 16-bit
       wchar_t a character may be a surrogate pair, hence the extra slot. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char encoded[MB_LEN_MAX];
    const wchar_t *pos;
    const wchar_t *chunk_start;

    /* The last slot is always the terminator */
    chunk[sizeof(chunk) / sizeof(chunk[0]) - 1] = 0;

    for (pos = wstr; *pos != L'\0'; ) {
        chunk_start = pos;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(pos[0])
            && Py_UNICODE_IS_LOW_SURROGATE(pos[1]))
        {
            /* Keep a surrogate pair together */
            chunk[0] = pos[0];
            chunk[1] = pos[1];
            pos += 2;
        }
        else {
            chunk[0] = pos[0];
            chunk[1] = 0;
            pos += 1;
        }
#else
        chunk[0] = *pos++;
#endif
        if (wcstombs(encoded, chunk, sizeof(encoded)) == (size_t)-1)
            return (size_t)(chunk_start - wstr);
    }

    /* failed to find the unencodable character */
    return 0;
}
3136
Victor Stinner1b579672011-12-17 05:47:23 +01003137static int
3138locale_error_handler(const char *errors, int *surrogateescape)
3139{
3140 if (errors == NULL) {
3141 *surrogateescape = 0;
3142 return 0;
3143 }
3144
3145 if (strcmp(errors, "strict") == 0) {
3146 *surrogateescape = 0;
3147 return 0;
3148 }
3149 if (strcmp(errors, "surrogateescape") == 0) {
3150 *surrogateescape = 1;
3151 return 0;
3152 }
3153 PyErr_Format(PyExc_ValueError,
3154 "only 'strict' and 'surrogateescape' error handlers "
3155 "are supported, not '%s'",
3156 errors);
3157 return -1;
3158}
3159
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003160PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003161PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003162{
3163 Py_ssize_t wlen, wlen2;
3164 wchar_t *wstr;
3165 PyObject *bytes = NULL;
3166 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003167 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003168 PyObject *exc;
3169 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003170 int surrogateescape;
3171
3172 if (locale_error_handler(errors, &surrogateescape) < 0)
3173 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003174
3175 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3176 if (wstr == NULL)
3177 return NULL;
3178
3179 wlen2 = wcslen(wstr);
3180 if (wlen2 != wlen) {
3181 PyMem_Free(wstr);
3182 PyErr_SetString(PyExc_TypeError, "embedded null character");
3183 return NULL;
3184 }
3185
3186 if (surrogateescape) {
3187 /* locale encoding with surrogateescape */
3188 char *str;
3189
3190 str = _Py_wchar2char(wstr, &error_pos);
3191 if (str == NULL) {
3192 if (error_pos == (size_t)-1) {
3193 PyErr_NoMemory();
3194 PyMem_Free(wstr);
3195 return NULL;
3196 }
3197 else {
3198 goto encode_error;
3199 }
3200 }
3201 PyMem_Free(wstr);
3202
3203 bytes = PyBytes_FromString(str);
3204 PyMem_Free(str);
3205 }
3206 else {
3207 size_t len, len2;
3208
3209 len = wcstombs(NULL, wstr, 0);
3210 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003211 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003212 goto encode_error;
3213 }
3214
3215 bytes = PyBytes_FromStringAndSize(NULL, len);
3216 if (bytes == NULL) {
3217 PyMem_Free(wstr);
3218 return NULL;
3219 }
3220
3221 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3222 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003223 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003224 goto encode_error;
3225 }
3226 PyMem_Free(wstr);
3227 }
3228 return bytes;
3229
3230encode_error:
3231 errmsg = strerror(errno);
3232 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003233
3234 if (error_pos == (size_t)-1)
3235 error_pos = wcstombs_errorpos(wstr);
3236
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003237 PyMem_Free(wstr);
3238 Py_XDECREF(bytes);
3239
Victor Stinner2f197072011-12-17 07:08:30 +01003240 if (errmsg != NULL) {
3241 size_t errlen;
3242 wstr = _Py_char2wchar(errmsg, &errlen);
3243 if (wstr != NULL) {
3244 reason = PyUnicode_FromWideChar(wstr, errlen);
3245 PyMem_Free(wstr);
3246 } else
3247 errmsg = NULL;
3248 }
3249 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003250 reason = PyUnicode_FromString(
3251 "wcstombs() encountered an unencodable "
3252 "wide character");
3253 if (reason == NULL)
3254 return NULL;
3255
3256 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3257 "locale", unicode,
3258 (Py_ssize_t)error_pos,
3259 (Py_ssize_t)(error_pos+1),
3260 reason);
3261 Py_DECREF(reason);
3262 if (exc != NULL) {
3263 PyCodec_StrictErrors(exc);
3264 Py_XDECREF(exc);
3265 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003266 return NULL;
3267}
3268
Victor Stinnerad158722010-10-27 00:25:46 +00003269PyObject *
3270PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003271{
Victor Stinner99b95382011-07-04 14:23:54 +02003272#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003273 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003274#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003275 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003276#else
Victor Stinner793b5312011-04-27 00:24:21 +02003277 PyInterpreterState *interp = PyThreadState_GET()->interp;
3278 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3279 cannot use it to encode and decode filenames before it is loaded. Load
3280 the Python codec requires to encode at least its own filename. Use the C
3281 version of the locale codec until the codec registry is initialized and
3282 the Python codec is loaded.
3283
3284 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3285 cannot only rely on it: check also interp->fscodec_initialized for
3286 subinterpreters. */
3287 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003288 return PyUnicode_AsEncodedString(unicode,
3289 Py_FileSystemDefaultEncoding,
3290 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003291 }
3292 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003293 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003294 }
Victor Stinnerad158722010-10-27 00:25:46 +00003295#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003296}
3297
Alexander Belopolsky40018472011-02-26 01:02:56 +00003298PyObject *
3299PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003300 const char *encoding,
3301 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302{
3303 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003304 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003305
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306 if (!PyUnicode_Check(unicode)) {
3307 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003308 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309 }
Fred Drakee4315f52000-05-09 19:53:39 +00003310
Fred Drakee4315f52000-05-09 19:53:39 +00003311 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003312 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003313 if ((strcmp(lower, "utf-8") == 0) ||
3314 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003315 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003316 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003317 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003318 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003319 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003320 }
Victor Stinner37296e82010-06-10 13:36:23 +00003321 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003322 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003323 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003324 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003325#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003326 else if (strcmp(lower, "mbcs") == 0)
3327 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003328#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003329 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332
3333 /* Encode via the codec registry */
3334 v = PyCodec_Encode(unicode, encoding, errors);
3335 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003336 return NULL;
3337
3338 /* The normal path */
3339 if (PyBytes_Check(v))
3340 return v;
3341
3342 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003343 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003344 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003345 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003346
3347 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3348 "encoder %s returned bytearray instead of bytes",
3349 encoding);
3350 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003351 Py_DECREF(v);
3352 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003353 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003354
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003355 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3356 Py_DECREF(v);
3357 return b;
3358 }
3359
3360 PyErr_Format(PyExc_TypeError,
3361 "encoder did not return a bytes object (type=%.400s)",
3362 Py_TYPE(v)->tp_name);
3363 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003364 return NULL;
3365}
3366
Alexander Belopolsky40018472011-02-26 01:02:56 +00003367PyObject *
3368PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003369 const char *encoding,
3370 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003371{
3372 PyObject *v;
3373
3374 if (!PyUnicode_Check(unicode)) {
3375 PyErr_BadArgument();
3376 goto onError;
3377 }
3378
3379 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003380 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381
3382 /* Encode via the codec registry */
3383 v = PyCodec_Encode(unicode, encoding, errors);
3384 if (v == NULL)
3385 goto onError;
3386 if (!PyUnicode_Check(v)) {
3387 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003388 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389 Py_TYPE(v)->tp_name);
3390 Py_DECREF(v);
3391 goto onError;
3392 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003394
Benjamin Peterson29060642009-01-31 22:14:21 +00003395 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003396 return NULL;
3397}
3398
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() reports as invalid or incomplete.

   Return the byte offset of that sequence.  Return 0 when no such sequence
   is found, or when HAVE_MBRTOWC is not defined and no search can be
   done; callers cannot tell these apart from a failure at offset 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *pos;
    size_t remaining;
    size_t consumed;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (pos = str, remaining = len;
         remaining != 0;
         pos += consumed, remaining -= consumed)
    {
        consumed = mbrtowc(&wc, (char*)pos, remaining, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return pos - str;
        }
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3429
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003430PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003431PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003432 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003433{
3434 wchar_t smallbuf[256];
3435 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3436 wchar_t *wstr;
3437 size_t wlen, wlen2;
3438 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003439 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003440 size_t error_pos;
3441 char *errmsg;
3442 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003443
3444 if (locale_error_handler(errors, &surrogateescape) < 0)
3445 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003446
3447 if (str[len] != '\0' || len != strlen(str)) {
3448 PyErr_SetString(PyExc_TypeError, "embedded null character");
3449 return NULL;
3450 }
3451
3452 if (surrogateescape)
3453 {
3454 wstr = _Py_char2wchar(str, &wlen);
3455 if (wstr == NULL) {
3456 if (wlen == (size_t)-1)
3457 PyErr_NoMemory();
3458 else
3459 PyErr_SetFromErrno(PyExc_OSError);
3460 return NULL;
3461 }
3462
3463 unicode = PyUnicode_FromWideChar(wstr, wlen);
3464 PyMem_Free(wstr);
3465 }
3466 else {
3467#ifndef HAVE_BROKEN_MBSTOWCS
3468 wlen = mbstowcs(NULL, str, 0);
3469#else
3470 wlen = len;
3471#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003472 if (wlen == (size_t)-1)
3473 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003474 if (wlen+1 <= smallbuf_len) {
3475 wstr = smallbuf;
3476 }
3477 else {
3478 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3479 return PyErr_NoMemory();
3480
3481 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3482 if (!wstr)
3483 return PyErr_NoMemory();
3484 }
3485
3486 /* This shouldn't fail now */
3487 wlen2 = mbstowcs(wstr, str, wlen+1);
3488 if (wlen2 == (size_t)-1) {
3489 if (wstr != smallbuf)
3490 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003491 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003492 }
3493#ifdef HAVE_BROKEN_MBSTOWCS
3494 assert(wlen2 == wlen);
3495#endif
3496 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3497 if (wstr != smallbuf)
3498 PyMem_Free(wstr);
3499 }
3500 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003501
3502decode_error:
3503 errmsg = strerror(errno);
3504 assert(errmsg != NULL);
3505
3506 error_pos = mbstowcs_errorpos(str, len);
3507 if (errmsg != NULL) {
3508 size_t errlen;
3509 wstr = _Py_char2wchar(errmsg, &errlen);
3510 if (wstr != NULL) {
3511 reason = PyUnicode_FromWideChar(wstr, errlen);
3512 PyMem_Free(wstr);
3513 } else
3514 errmsg = NULL;
3515 }
3516 if (errmsg == NULL)
3517 reason = PyUnicode_FromString(
3518 "mbstowcs() encountered an invalid multibyte sequence");
3519 if (reason == NULL)
3520 return NULL;
3521
3522 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3523 "locale", str, len,
3524 (Py_ssize_t)error_pos,
3525 (Py_ssize_t)(error_pos+1),
3526 reason);
3527 Py_DECREF(reason);
3528 if (exc != NULL) {
3529 PyCodec_StrictErrors(exc);
3530 Py_XDECREF(exc);
3531 }
3532 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003533}
3534
3535PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003536PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537{
3538 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003539 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003540}
3541
3542
3543PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003544PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003545 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003546 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3547}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003548
Christian Heimes5894ba72007-11-04 11:43:14 +00003549PyObject*
3550PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3551{
Victor Stinner99b95382011-07-04 14:23:54 +02003552#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003553 return PyUnicode_DecodeMBCS(s, size, NULL);
3554#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003555 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003556#else
Victor Stinner793b5312011-04-27 00:24:21 +02003557 PyInterpreterState *interp = PyThreadState_GET()->interp;
3558 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3559 cannot use it to encode and decode filenames before it is loaded. Load
3560 the Python codec requires to encode at least its own filename. Use the C
3561 version of the locale codec until the codec registry is initialized and
3562 the Python codec is loaded.
3563
3564 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3565 cannot only rely on it: check also interp->fscodec_initialized for
3566 subinterpreters. */
3567 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003568 return PyUnicode_Decode(s, size,
3569 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003570 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003571 }
3572 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003573 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003574 }
Victor Stinnerad158722010-10-27 00:25:46 +00003575#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003576}
3577
Martin v. Löwis011e8422009-05-05 04:43:17 +00003578
3579int
Antoine Pitrou13348842012-01-29 18:36:34 +01003580_PyUnicode_HasNULChars(PyObject* s)
3581{
3582 static PyObject *nul = NULL;
3583
3584 if (nul == NULL)
3585 nul = PyUnicode_FromStringAndSize("\0", 1);
3586 if (nul == NULL)
3587 return -1;
3588 return PyUnicode_Contains(s, nul);
3589}
3590
3591
3592int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003593PyUnicode_FSConverter(PyObject* arg, void* addr)
3594{
3595 PyObject *output = NULL;
3596 Py_ssize_t size;
3597 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003598 if (arg == NULL) {
3599 Py_DECREF(*(PyObject**)addr);
3600 return 1;
3601 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003602 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003603 output = arg;
3604 Py_INCREF(output);
3605 }
3606 else {
3607 arg = PyUnicode_FromObject(arg);
3608 if (!arg)
3609 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003610 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003611 Py_DECREF(arg);
3612 if (!output)
3613 return 0;
3614 if (!PyBytes_Check(output)) {
3615 Py_DECREF(output);
3616 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3617 return 0;
3618 }
3619 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003620 size = PyBytes_GET_SIZE(output);
3621 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003622 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003623 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003624 Py_DECREF(output);
3625 return 0;
3626 }
3627 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003628 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003629}
3630
3631
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003632int
3633PyUnicode_FSDecoder(PyObject* arg, void* addr)
3634{
3635 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003636 if (arg == NULL) {
3637 Py_DECREF(*(PyObject**)addr);
3638 return 1;
3639 }
3640 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003641 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003642 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003643 output = arg;
3644 Py_INCREF(output);
3645 }
3646 else {
3647 arg = PyBytes_FromObject(arg);
3648 if (!arg)
3649 return 0;
3650 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3651 PyBytes_GET_SIZE(arg));
3652 Py_DECREF(arg);
3653 if (!output)
3654 return 0;
3655 if (!PyUnicode_Check(output)) {
3656 Py_DECREF(output);
3657 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3658 return 0;
3659 }
3660 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003661 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003662 Py_DECREF(output);
3663 return 0;
3664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003665 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003666 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003667 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3668 Py_DECREF(output);
3669 return 0;
3670 }
3671 *(PyObject**)addr = output;
3672 return Py_CLEANUP_SUPPORTED;
3673}
3674
3675
Martin v. Löwis5b222132007-06-10 09:51:05 +00003676char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003677PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003678{
Christian Heimesf3863112007-11-22 07:46:41 +00003679 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003680
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003681 if (!PyUnicode_Check(unicode)) {
3682 PyErr_BadArgument();
3683 return NULL;
3684 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003685 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003686 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003688 if (PyUnicode_UTF8(unicode) == NULL) {
3689 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003690 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3691 if (bytes == NULL)
3692 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003693 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3694 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003695 Py_DECREF(bytes);
3696 return NULL;
3697 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003698 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3699 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3700 PyBytes_AS_STRING(bytes),
3701 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003702 Py_DECREF(bytes);
3703 }
3704
3705 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003706 *psize = PyUnicode_UTF8_LENGTH(unicode);
3707 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003708}
3709
3710char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3714}
3715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003716Py_UNICODE *
3717PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3718{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003719 const unsigned char *one_byte;
3720#if SIZEOF_WCHAR_T == 4
3721 const Py_UCS2 *two_bytes;
3722#else
3723 const Py_UCS4 *four_bytes;
3724 const Py_UCS4 *ucs4_end;
3725 Py_ssize_t num_surrogates;
3726#endif
3727 wchar_t *w;
3728 wchar_t *wchar_end;
3729
3730 if (!PyUnicode_Check(unicode)) {
3731 PyErr_BadArgument();
3732 return NULL;
3733 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003734 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003736 assert(_PyUnicode_KIND(unicode) != 0);
3737 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003739 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003741 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3742 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743 num_surrogates = 0;
3744
3745 for (; four_bytes < ucs4_end; ++four_bytes) {
3746 if (*four_bytes > 0xFFFF)
3747 ++num_surrogates;
3748 }
3749
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003750 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3751 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3752 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 PyErr_NoMemory();
3754 return NULL;
3755 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003756 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003758 w = _PyUnicode_WSTR(unicode);
3759 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3760 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3762 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003763 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003764 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003765 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3766 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767 }
3768 else
3769 *w = *four_bytes;
3770
3771 if (w > wchar_end) {
3772 assert(0 && "Miscalculated string end");
3773 }
3774 }
3775 *w = 0;
3776#else
3777 /* sizeof(wchar_t) == 4 */
3778 Py_FatalError("Impossible unicode object state, wstr and str "
3779 "should share memory already.");
3780 return NULL;
3781#endif
3782 }
3783 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003784 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3785 (_PyUnicode_LENGTH(unicode) + 1));
3786 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003787 PyErr_NoMemory();
3788 return NULL;
3789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003790 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3791 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3792 w = _PyUnicode_WSTR(unicode);
3793 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003795 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3796 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 for (; w < wchar_end; ++one_byte, ++w)
3798 *w = *one_byte;
3799 /* null-terminate the wstr */
3800 *w = 0;
3801 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003802 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003804 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003805 for (; w < wchar_end; ++two_bytes, ++w)
3806 *w = *two_bytes;
3807 /* null-terminate the wstr */
3808 *w = 0;
3809#else
3810 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 PyObject_FREE(_PyUnicode_WSTR(unicode));
3812 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003813 Py_FatalError("Impossible unicode object state, wstr "
3814 "and str should share memory already.");
3815 return NULL;
3816#endif
3817 }
3818 else {
3819 assert(0 && "This should never happen.");
3820 }
3821 }
3822 }
3823 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003824 *size = PyUnicode_WSTR_LENGTH(unicode);
3825 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003826}
3827
Alexander Belopolsky40018472011-02-26 01:02:56 +00003828Py_UNICODE *
3829PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003831 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832}
3833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003834
Alexander Belopolsky40018472011-02-26 01:02:56 +00003835Py_ssize_t
3836PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837{
3838 if (!PyUnicode_Check(unicode)) {
3839 PyErr_BadArgument();
3840 goto onError;
3841 }
3842 return PyUnicode_GET_SIZE(unicode);
3843
Benjamin Peterson29060642009-01-31 22:14:21 +00003844 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 return -1;
3846}
3847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848Py_ssize_t
3849PyUnicode_GetLength(PyObject *unicode)
3850{
Victor Stinner07621332012-06-16 04:53:46 +02003851 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852 PyErr_BadArgument();
3853 return -1;
3854 }
Victor Stinner07621332012-06-16 04:53:46 +02003855 if (PyUnicode_READY(unicode) == -1)
3856 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 return PyUnicode_GET_LENGTH(unicode);
3858}
3859
3860Py_UCS4
3861PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3862{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003863 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3864 PyErr_BadArgument();
3865 return (Py_UCS4)-1;
3866 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003867 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003868 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869 return (Py_UCS4)-1;
3870 }
3871 return PyUnicode_READ_CHAR(unicode, index);
3872}
3873
3874int
3875PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3876{
3877 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003878 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 return -1;
3880 }
Victor Stinner488fa492011-12-12 00:01:39 +01003881 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003882 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003883 PyErr_SetString(PyExc_IndexError, "string index out of range");
3884 return -1;
3885 }
Victor Stinner488fa492011-12-12 00:01:39 +01003886 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003887 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003888 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3889 PyErr_SetString(PyExc_ValueError, "character out of range");
3890 return -1;
3891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003892 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3893 index, ch);
3894 return 0;
3895}
3896
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *encoding = "utf-8";

    return encoding;
}
3902
Victor Stinner554f3f02010-06-16 23:33:54 +00003903/* create or adjust a UnicodeDecodeError */
3904static void
3905make_decode_exception(PyObject **exceptionObject,
3906 const char *encoding,
3907 const char *input, Py_ssize_t length,
3908 Py_ssize_t startpos, Py_ssize_t endpos,
3909 const char *reason)
3910{
3911 if (*exceptionObject == NULL) {
3912 *exceptionObject = PyUnicodeDecodeError_Create(
3913 encoding, input, length, startpos, endpos, reason);
3914 }
3915 else {
3916 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3917 goto onError;
3918 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3919 goto onError;
3920 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3921 goto onError;
3922 }
3923 return;
3924
3925onError:
3926 Py_DECREF(*exceptionObject);
3927 *exceptionObject = NULL;
3928}
3929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003930/* error handling callback helper:
3931 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003932 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003933 and adjust various state variables.
3934 return 0 on success, -1 on error
3935*/
3936
Alexander Belopolsky40018472011-02-26 01:02:56 +00003937static int
3938unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003939 const char *encoding, const char *reason,
3940 const char **input, const char **inend, Py_ssize_t *startinpos,
3941 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003942 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003943{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003944 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003945
3946 PyObject *restuple = NULL;
3947 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003948 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003949 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003950 Py_ssize_t requiredsize;
3951 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003952 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003953 int res = -1;
3954
Victor Stinner596a6c42011-11-09 00:02:18 +01003955 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3956 outsize = PyUnicode_GET_LENGTH(*output);
3957 else
3958 outsize = _PyUnicode_WSTR_LENGTH(*output);
3959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003961 *errorHandler = PyCodec_LookupError(errors);
3962 if (*errorHandler == NULL)
3963 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 }
3965
Victor Stinner554f3f02010-06-16 23:33:54 +00003966 make_decode_exception(exceptionObject,
3967 encoding,
3968 *input, *inend - *input,
3969 *startinpos, *endinpos,
3970 reason);
3971 if (*exceptionObject == NULL)
3972 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003973
3974 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3975 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003978 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003979 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 }
3981 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003982 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003983 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003984 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003985
3986 /* Copy back the bytes variables, which might have been modified by the
3987 callback */
3988 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3989 if (!inputobj)
3990 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003991 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003992 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003993 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003994 *input = PyBytes_AS_STRING(inputobj);
3995 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003996 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003997 /* we can DECREF safely, as the exception has another reference,
3998 so the object won't go away. */
3999 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004001 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004003 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4005 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004006 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007
Victor Stinner596a6c42011-11-09 00:02:18 +01004008 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4009 /* need more space? (at least enough for what we
4010 have+the replacement+the rest of the string (starting
4011 at the new input position), so we won't have to check space
4012 when there are no errors in the rest of the string) */
4013 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4014 requiredsize = *outpos + replen + insize-newpos;
4015 if (requiredsize > outsize) {
4016 if (requiredsize<2*outsize)
4017 requiredsize = 2*outsize;
4018 if (unicode_resize(output, requiredsize) < 0)
4019 goto onError;
4020 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004021 if (unicode_widen(output, *outpos,
4022 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004023 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004024 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004025 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004026 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004027 else {
4028 wchar_t *repwstr;
4029 Py_ssize_t repwlen;
4030 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4031 if (repwstr == NULL)
4032 goto onError;
4033 /* need more space? (at least enough for what we
4034 have+the replacement+the rest of the string (starting
4035 at the new input position), so we won't have to check space
4036 when there are no errors in the rest of the string) */
4037 requiredsize = *outpos + repwlen + insize-newpos;
4038 if (requiredsize > outsize) {
4039 if (requiredsize < 2*outsize)
4040 requiredsize = 2*outsize;
4041 if (unicode_resize(output, requiredsize) < 0)
4042 goto onError;
4043 }
4044 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4045 *outpos += repwlen;
4046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004047 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004048 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004050 /* we made it! */
4051 res = 0;
4052
Benjamin Peterson29060642009-01-31 22:14:21 +00004053 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 Py_XDECREF(restuple);
4055 return res;
4056}
4057
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004058/* --- UTF-7 Codec -------------------------------------------------------- */
4059
Antoine Pitrou244651a2009-05-04 18:56:13 +00004060/* See RFC2152 for details. We encode conservatively and decode liberally. */
4061
/* Three simple macros defining base-64.  Each one may evaluate its
   argument more than once, so pass only side-effect-free expressions. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ('0' <= (c) && (c) <= '9') || \
     ('A' <= (c) && (c) <= 'Z') || \
     ('a' <= (c) && (c) <= 'z'))

/* given that c is a base-64 character, what is its base-64 value?
   ('A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62,
   anything else, i.e. '/', -> 63) */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 : \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 : \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4092
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times; directO and directWS are parenthesized
 * so that arbitrary boolean expressions can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004138
Alexander Belopolsky40018472011-02-26 01:02:56 +00004139PyObject *
4140PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004141 Py_ssize_t size,
4142 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004143{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004144 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4145}
4146
Antoine Pitrou244651a2009-05-04 18:56:13 +00004147/* The decoder. The only state we preserve is our read position,
4148 * i.e. how many characters we have consumed. So if we end in the
4149 * middle of a shift sequence we have to back off the read position
4150 * and the output to the beginning of the sequence, otherwise we lose
4151 * all the shift state (seen bits, number of bits seen, high
4152 * surrogate). */
4153
Alexander Belopolsky40018472011-02-26 01:02:56 +00004154PyObject *
4155PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004156 Py_ssize_t size,
4157 const char *errors,
4158 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004159{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004161 Py_ssize_t startinpos;
4162 Py_ssize_t endinpos;
4163 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004164 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004165 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004166 const char *errmsg = "";
4167 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004168 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004169 unsigned int base64bits = 0;
4170 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004171 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 PyObject *errorHandler = NULL;
4173 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004174
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004175 /* Start off assuming it's all ASCII. Widen later as necessary. */
4176 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004177 if (!unicode)
4178 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004179 if (size == 0) {
4180 if (consumed)
4181 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004182 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004183 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004184
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004185 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004186 e = s + size;
4187
4188 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004189 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004191 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004192
Antoine Pitrou244651a2009-05-04 18:56:13 +00004193 if (inShift) { /* in a base-64 section */
4194 if (IS_BASE64(ch)) { /* consume a base-64 character */
4195 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4196 base64bits += 6;
4197 s++;
4198 if (base64bits >= 16) {
4199 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004200 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004201 base64bits -= 16;
4202 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4203 if (surrogate) {
4204 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004205 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4206 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004207 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4208 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004209 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004210 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004211 }
4212 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004213 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4214 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004215 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004216 }
4217 }
Victor Stinner551ac952011-11-29 22:58:13 +01004218 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219 /* first surrogate */
4220 surrogate = outCh;
4221 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004222 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004223 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4224 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004225 }
4226 }
4227 }
4228 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004229 inShift = 0;
4230 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004231 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004232 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4233 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004234 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004235 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004236 if (base64bits > 0) { /* left-over bits */
4237 if (base64bits >= 6) {
4238 /* We've seen at least one base-64 character */
4239 errmsg = "partial character in shift sequence";
4240 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004242 else {
4243 /* Some bits remain; they should be zero */
4244 if (base64buffer != 0) {
4245 errmsg = "non-zero padding bits in shift sequence";
4246 goto utf7Error;
4247 }
4248 }
4249 }
4250 if (ch != '-') {
4251 /* '-' is absorbed; other terminating
4252 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004253 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4254 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004255 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004256 }
4257 }
4258 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004259 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004260 s++; /* consume '+' */
4261 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004262 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004263 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4264 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004265 }
4266 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004267 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004268 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004269 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270 }
4271 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004272 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004273 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4274 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004275 s++;
4276 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004277 else {
4278 startinpos = s-starts;
4279 s++;
4280 errmsg = "unexpected special character";
4281 goto utf7Error;
4282 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004283 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004284utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004285 endinpos = s-starts;
4286 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004287 errors, &errorHandler,
4288 "utf7", errmsg,
4289 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004290 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004291 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292 }
4293
Antoine Pitrou244651a2009-05-04 18:56:13 +00004294 /* end of string */
4295
4296 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4297 /* if we're in an inconsistent state, that's an error */
4298 if (surrogate ||
4299 (base64bits >= 6) ||
4300 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004301 endinpos = size;
4302 if (unicode_decode_call_errorhandler(
4303 errors, &errorHandler,
4304 "utf7", "unterminated shift sequence",
4305 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004306 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 goto onError;
4308 if (s < e)
4309 goto restart;
4310 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004311 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004312
4313 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004314 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004316 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004317 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 }
4319 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004320 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004322 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004324 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 goto onError;
4326
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 Py_XDECREF(errorHandler);
4328 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004329 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330
Benjamin Peterson29060642009-01-31 22:14:21 +00004331 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004332 Py_XDECREF(errorHandler);
4333 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004334 Py_DECREF(unicode);
4335 return NULL;
4336}
4337
4338
Alexander Belopolsky40018472011-02-26 01:02:56 +00004339PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004340_PyUnicode_EncodeUTF7(PyObject *str,
4341 int base64SetO,
4342 int base64WhiteSpace,
4343 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004344{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004345 int kind;
4346 void *data;
4347 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004348 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004349 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004350 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 unsigned int base64bits = 0;
4352 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 char * out;
4354 char * start;
4355
Benjamin Petersonbac79492012-01-14 13:34:47 -05004356 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004357 return NULL;
4358 kind = PyUnicode_KIND(str);
4359 data = PyUnicode_DATA(str);
4360 len = PyUnicode_GET_LENGTH(str);
4361
4362 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004363 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004364
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004365 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004366 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004367 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004368 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004369 if (v == NULL)
4370 return NULL;
4371
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004372 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004373 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004374 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375
Antoine Pitrou244651a2009-05-04 18:56:13 +00004376 if (inShift) {
4377 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4378 /* shifting out */
4379 if (base64bits) { /* output remaining bits */
4380 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4381 base64buffer = 0;
4382 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004383 }
4384 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004385 /* Characters not in the BASE64 set implicitly unshift the sequence
4386 so no '-' is required, except if the character is itself a '-' */
4387 if (IS_BASE64(ch) || ch == '-') {
4388 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 *out++ = (char) ch;
4391 }
4392 else {
4393 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004394 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004395 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004396 else { /* not in a shift sequence */
4397 if (ch == '+') {
4398 *out++ = '+';
4399 *out++ = '-';
4400 }
4401 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4402 *out++ = (char) ch;
4403 }
4404 else {
4405 *out++ = '+';
4406 inShift = 1;
4407 goto encode_char;
4408 }
4409 }
4410 continue;
4411encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004413 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004414
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 /* code first surrogate */
4416 base64bits += 16;
4417 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4418 while (base64bits >= 6) {
4419 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4420 base64bits -= 6;
4421 }
4422 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004423 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 base64bits += 16;
4426 base64buffer = (base64buffer << 16) | ch;
4427 while (base64bits >= 6) {
4428 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4429 base64bits -= 6;
4430 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004431 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004432 if (base64bits)
4433 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4434 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004435 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004436 if (_PyBytes_Resize(&v, out - start) < 0)
4437 return NULL;
4438 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004440PyObject *
4441PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4442 Py_ssize_t size,
4443 int base64SetO,
4444 int base64WhiteSpace,
4445 const char *errors)
4446{
4447 PyObject *result;
4448 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4449 if (tmp == NULL)
4450 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004451 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004452 base64WhiteSpace, errors);
4453 Py_DECREF(tmp);
4454 return result;
4455}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004456
Antoine Pitrou244651a2009-05-04 18:56:13 +00004457#undef IS_BASE64
4458#undef FROM_BASE64
4459#undef TO_BASE64
4460#undef DECODE_DIRECT
4461#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463/* --- UTF-8 Codec -------------------------------------------------------- */
4464
Alexander Belopolsky40018472011-02-26 01:02:56 +00004465PyObject *
4466PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004467 Py_ssize_t size,
4468 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469{
Walter Dörwald69652032004-09-07 20:24:22 +00004470 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4471}
4472
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004473#include "stringlib/asciilib.h"
4474#include "stringlib/codecs.h"
4475#include "stringlib/undef.h"
4476
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004477#include "stringlib/ucs1lib.h"
4478#include "stringlib/codecs.h"
4479#include "stringlib/undef.h"
4480
4481#include "stringlib/ucs2lib.h"
4482#include "stringlib/codecs.h"
4483#include "stringlib/undef.h"
4484
4485#include "stringlib/ucs4lib.h"
4486#include "stringlib/codecs.h"
4487#include "stringlib/undef.h"
4488
Antoine Pitrouab868312009-01-10 15:40:25 +00004489/* Mask to quickly check whether a C 'long' contains a
4490 non-ASCII, UTF8-encoded char. */
4491#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004492# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004493#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004494# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004495#else
4496# error C 'long' size should be either 4 or 8!
4497#endif
4498
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004499static Py_ssize_t
4500ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004501{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004502 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004503 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004504
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004505#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004506 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4507 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004508 /* Fast path, see in STRINGLIB(utf8_decode) for
4509 an explanation. */
4510 /* Help register allocation */
4511 register const char *_p = p;
4512 register Py_UCS1 * q = dest;
4513 while (_p < aligned_end) {
4514 unsigned long value = *(const unsigned long *) _p;
4515 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004516 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004517 *((unsigned long *)q) = value;
4518 _p += SIZEOF_LONG;
4519 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004520 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004521 p = _p;
4522 while (p < end) {
4523 if ((unsigned char)*p & 0x80)
4524 break;
4525 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004527 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004529#endif
4530 while (p < end) {
4531 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4532 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004533 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004534 /* Help register allocation */
4535 register const char *_p = p;
4536 while (_p < aligned_end) {
4537 unsigned long value = *(unsigned long *) _p;
4538 if (value & ASCII_CHAR_MASK)
4539 break;
4540 _p += SIZEOF_LONG;
4541 }
4542 p = _p;
4543 if (_p == end)
4544 break;
4545 }
4546 if ((unsigned char)*p & 0x80)
4547 break;
4548 ++p;
4549 }
4550 memcpy(dest, start, p - start);
4551 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552}
Antoine Pitrouab868312009-01-10 15:40:25 +00004553
Victor Stinner785938e2011-12-11 20:09:03 +01004554PyObject *
4555PyUnicode_DecodeUTF8Stateful(const char *s,
4556 Py_ssize_t size,
4557 const char *errors,
4558 Py_ssize_t *consumed)
4559{
Victor Stinner785938e2011-12-11 20:09:03 +01004560 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004561 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004562 const char *end = s + size;
4563 Py_ssize_t outpos;
4564
4565 Py_ssize_t startinpos;
4566 Py_ssize_t endinpos;
4567 const char *errmsg = "";
4568 PyObject *errorHandler = NULL;
4569 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004570
4571 if (size == 0) {
4572 if (consumed)
4573 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004574 Py_INCREF(unicode_empty);
4575 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004576 }
4577
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004578 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4579 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004580 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004581 *consumed = 1;
4582 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004583 }
4584
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004585 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004586 if (!unicode)
4587 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004588
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004589 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4590 s += outpos;
4591 while (s < end) {
4592 Py_UCS4 ch;
4593 int kind = PyUnicode_KIND(unicode);
4594 if (kind == PyUnicode_1BYTE_KIND) {
4595 if (PyUnicode_IS_ASCII(unicode))
4596 ch = asciilib_utf8_decode(&s, end,
4597 PyUnicode_1BYTE_DATA(unicode), &outpos);
4598 else
4599 ch = ucs1lib_utf8_decode(&s, end,
4600 PyUnicode_1BYTE_DATA(unicode), &outpos);
4601 } else if (kind == PyUnicode_2BYTE_KIND) {
4602 ch = ucs2lib_utf8_decode(&s, end,
4603 PyUnicode_2BYTE_DATA(unicode), &outpos);
4604 } else {
4605 assert(kind == PyUnicode_4BYTE_KIND);
4606 ch = ucs4lib_utf8_decode(&s, end,
4607 PyUnicode_4BYTE_DATA(unicode), &outpos);
4608 }
4609
4610 switch (ch) {
4611 case 0:
4612 if (s == end || consumed)
4613 goto End;
4614 errmsg = "unexpected end of data";
4615 startinpos = s - starts;
4616 endinpos = startinpos + 1;
4617 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4618 endinpos++;
4619 break;
4620 case 1:
4621 errmsg = "invalid start byte";
4622 startinpos = s - starts;
4623 endinpos = startinpos + 1;
4624 break;
4625 case 2:
4626 errmsg = "invalid continuation byte";
4627 startinpos = s - starts;
4628 endinpos = startinpos + 1;
4629 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4630 endinpos++;
4631 break;
4632 default:
4633 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4634 goto onError;
4635 continue;
4636 }
4637
4638 if (unicode_decode_call_errorhandler(
4639 errors, &errorHandler,
4640 "utf-8", errmsg,
4641 &starts, &end, &startinpos, &endinpos, &exc, &s,
4642 &unicode, &outpos))
4643 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004644 }
4645
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004646End:
4647 if (unicode_resize(&unicode, outpos) < 0)
4648 goto onError;
4649
4650 if (consumed)
4651 *consumed = s - starts;
4652
4653 Py_XDECREF(errorHandler);
4654 Py_XDECREF(exc);
4655 assert(_PyUnicode_CheckConsistency(unicode, 1));
4656 return unicode;
4657
4658onError:
4659 Py_XDECREF(errorHandler);
4660 Py_XDECREF(exc);
4661 Py_XDECREF(unicode);
4662 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004663}
4664
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a NUL-terminated wchar_t buffer
   obtained from PyMem_Malloc().  Every byte that is not part of a valid
   sequence is stored as 0xDC00 + byte.  Returns NULL if the buffer size
   would overflow (PyErr_NoMemory() is called in that case) or if the
   allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count.
       The guard is written without computing size + 1 first, so that it
       cannot itself overflow for size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* a 4-byte destination can hold every code point directly */
            assert(0);
#else
            /* The decoder hands back a code point that does not fit into
               16 bits; it is a supplementary character (not a surrogate)
               and has to be split into a surrogate pair here. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch == 0 with all input used up: normal end of data */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004720/* Primary internal function which creates utf8 encoded bytes objects.
4721
4722 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004723 and allocate exactly as much space needed at the end. Else allocate the
4724 maximum possible needed (4 result bytes per Unicode character), and return
4725 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004726*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004727PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004728_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729{
Victor Stinner6099a032011-12-18 14:22:26 +01004730 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004731 void *data;
4732 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004734 if (!PyUnicode_Check(unicode)) {
4735 PyErr_BadArgument();
4736 return NULL;
4737 }
4738
4739 if (PyUnicode_READY(unicode) == -1)
4740 return NULL;
4741
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004742 if (PyUnicode_UTF8(unicode))
4743 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4744 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004745
4746 kind = PyUnicode_KIND(unicode);
4747 data = PyUnicode_DATA(unicode);
4748 size = PyUnicode_GET_LENGTH(unicode);
4749
Benjamin Petersonead6b532011-12-20 17:23:42 -06004750 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004751 default:
4752 assert(0);
4753 case PyUnicode_1BYTE_KIND:
4754 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4755 assert(!PyUnicode_IS_ASCII(unicode));
4756 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4757 case PyUnicode_2BYTE_KIND:
4758 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4759 case PyUnicode_4BYTE_KIND:
4760 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762}
4763
Alexander Belopolsky40018472011-02-26 01:02:56 +00004764PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4766 Py_ssize_t size,
4767 const char *errors)
4768{
4769 PyObject *v, *unicode;
4770
4771 unicode = PyUnicode_FromUnicode(s, size);
4772 if (unicode == NULL)
4773 return NULL;
4774 v = _PyUnicode_AsUTF8String(unicode, errors);
4775 Py_DECREF(unicode);
4776 return v;
4777}
4778
4779PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004780PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783}
4784
Walter Dörwald41980ca2007-08-16 21:55:45 +00004785/* --- UTF-32 Codec ------------------------------------------------------- */
4786
4787PyObject *
4788PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004789 Py_ssize_t size,
4790 const char *errors,
4791 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004792{
4793 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4794}
4795
4796PyObject *
4797PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004798 Py_ssize_t size,
4799 const char *errors,
4800 int *byteorder,
4801 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802{
4803 const char *starts = s;
4804 Py_ssize_t startinpos;
4805 Py_ssize_t endinpos;
4806 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004807 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004808 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004809 int bo = 0; /* assume native ordering by default */
4810 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004811 /* Offsets from q for retrieving bytes in the right order. */
4812#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4813 int iorder[] = {0, 1, 2, 3};
4814#else
4815 int iorder[] = {3, 2, 1, 0};
4816#endif
4817 PyObject *errorHandler = NULL;
4818 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004819
Walter Dörwald41980ca2007-08-16 21:55:45 +00004820 q = (unsigned char *)s;
4821 e = q + size;
4822
4823 if (byteorder)
4824 bo = *byteorder;
4825
4826 /* Check for BOM marks (U+FEFF) in the input and adjust current
4827 byte order setting accordingly. In native mode, the leading BOM
4828 mark is skipped, in all other modes, it is copied to the output
4829 stream as-is (giving a ZWNBSP character). */
4830 if (bo == 0) {
4831 if (size >= 4) {
4832 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004833 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004834#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 if (bom == 0x0000FEFF) {
4836 q += 4;
4837 bo = -1;
4838 }
4839 else if (bom == 0xFFFE0000) {
4840 q += 4;
4841 bo = 1;
4842 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004843#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004844 if (bom == 0x0000FEFF) {
4845 q += 4;
4846 bo = 1;
4847 }
4848 else if (bom == 0xFFFE0000) {
4849 q += 4;
4850 bo = -1;
4851 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004852#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004853 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004854 }
4855
4856 if (bo == -1) {
4857 /* force LE */
4858 iorder[0] = 0;
4859 iorder[1] = 1;
4860 iorder[2] = 2;
4861 iorder[3] = 3;
4862 }
4863 else if (bo == 1) {
4864 /* force BE */
4865 iorder[0] = 3;
4866 iorder[1] = 2;
4867 iorder[2] = 1;
4868 iorder[3] = 0;
4869 }
4870
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004871 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004872 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004873 if (!unicode)
4874 return NULL;
4875 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004876 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004877 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004878
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 Py_UCS4 ch;
4881 /* remaining bytes at the end? (size should be divisible by 4) */
4882 if (e-q<4) {
4883 if (consumed)
4884 break;
4885 errmsg = "truncated data";
4886 startinpos = ((const char *)q)-starts;
4887 endinpos = ((const char *)e)-starts;
4888 goto utf32Error;
4889 /* The remaining input chars are ignored if the callback
4890 chooses to skip the input */
4891 }
4892 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4893 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004894
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 if (ch >= 0x110000)
4896 {
4897 errmsg = "codepoint not in range(0x110000)";
4898 startinpos = ((const char *)q)-starts;
4899 endinpos = startinpos+4;
4900 goto utf32Error;
4901 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004902 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4903 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00004904 q += 4;
4905 continue;
4906 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 if (unicode_decode_call_errorhandler(
4908 errors, &errorHandler,
4909 "utf32", errmsg,
4910 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004911 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913 }
4914
4915 if (byteorder)
4916 *byteorder = bo;
4917
4918 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004919 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920
4921 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01004922 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004923 goto onError;
4924
4925 Py_XDECREF(errorHandler);
4926 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004927 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930 Py_DECREF(unicode);
4931 Py_XDECREF(errorHandler);
4932 Py_XDECREF(exc);
4933 return NULL;
4934}
4935
4936PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004937_PyUnicode_EncodeUTF32(PyObject *str,
4938 const char *errors,
4939 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004941 int kind;
4942 void *data;
4943 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004944 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004946 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 /* Offsets from p for storing byte pairs in the right order. */
4948#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4949 int iorder[] = {0, 1, 2, 3};
4950#else
4951 int iorder[] = {3, 2, 1, 0};
4952#endif
4953
Benjamin Peterson29060642009-01-31 22:14:21 +00004954#define STORECHAR(CH) \
4955 do { \
4956 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4957 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4958 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4959 p[iorder[0]] = (CH) & 0xff; \
4960 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004961 } while(0)
4962
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004963 if (!PyUnicode_Check(str)) {
4964 PyErr_BadArgument();
4965 return NULL;
4966 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004967 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004968 return NULL;
4969 kind = PyUnicode_KIND(str);
4970 data = PyUnicode_DATA(str);
4971 len = PyUnicode_GET_LENGTH(str);
4972
4973 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004974 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004975 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004976 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 if (v == NULL)
4978 return NULL;
4979
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004980 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004981 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004982 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004983 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004984 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985
4986 if (byteorder == -1) {
4987 /* force LE */
4988 iorder[0] = 0;
4989 iorder[1] = 1;
4990 iorder[2] = 2;
4991 iorder[3] = 3;
4992 }
4993 else if (byteorder == 1) {
4994 /* force BE */
4995 iorder[0] = 3;
4996 iorder[1] = 2;
4997 iorder[2] = 1;
4998 iorder[3] = 0;
4999 }
5000
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005001 for (i = 0; i < len; i++)
5002 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005003
5004 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005005 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006#undef STORECHAR
5007}
5008
Alexander Belopolsky40018472011-02-26 01:02:56 +00005009PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005010PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5011 Py_ssize_t size,
5012 const char *errors,
5013 int byteorder)
5014{
5015 PyObject *result;
5016 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5017 if (tmp == NULL)
5018 return NULL;
5019 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5020 Py_DECREF(tmp);
5021 return result;
5022}
5023
5024PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005025PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005026{
Victor Stinnerb960b342011-11-20 19:12:52 +01005027 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028}
5029
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030/* --- UTF-16 Codec ------------------------------------------------------- */
5031
Tim Peters772747b2001-08-09 22:21:55 +00005032PyObject *
5033PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 Py_ssize_t size,
5035 const char *errors,
5036 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037{
Walter Dörwald69652032004-09-07 20:24:22 +00005038 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5039}
5040
5041PyObject *
5042PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 Py_ssize_t size,
5044 const char *errors,
5045 int *byteorder,
5046 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005047{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005048 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005049 Py_ssize_t startinpos;
5050 Py_ssize_t endinpos;
5051 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005052 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005053 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005054 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005055 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005056 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005057 PyObject *errorHandler = NULL;
5058 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059
Tim Peters772747b2001-08-09 22:21:55 +00005060 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005061 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062
5063 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005064 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005066 /* Check for BOM marks (U+FEFF) in the input and adjust current
5067 byte order setting accordingly. In native mode, the leading BOM
5068 mark is skipped, in all other modes, it is copied to the output
5069 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005070 if (bo == 0 && size >= 2) {
5071 const Py_UCS4 bom = (q[1] << 8) | q[0];
5072 if (bom == 0xFEFF) {
5073 q += 2;
5074 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005075 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005076 else if (bom == 0xFFFE) {
5077 q += 2;
5078 bo = 1;
5079 }
5080 if (byteorder)
5081 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083
Antoine Pitrou63065d72012-05-15 23:48:04 +02005084 if (q == e) {
5085 if (consumed)
5086 *consumed = size;
5087 Py_INCREF(unicode_empty);
5088 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005089 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005090
Antoine Pitrouab868312009-01-10 15:40:25 +00005091#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005092 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005093#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005094 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005095#endif
Tim Peters772747b2001-08-09 22:21:55 +00005096
Antoine Pitrou63065d72012-05-15 23:48:04 +02005097 /* Note: size will always be longer than the resulting Unicode
5098 character count */
5099 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5100 if (!unicode)
5101 return NULL;
5102
5103 outpos = 0;
5104 while (1) {
5105 Py_UCS4 ch = 0;
5106 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005107 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005108 if (kind == PyUnicode_1BYTE_KIND) {
5109 if (PyUnicode_IS_ASCII(unicode))
5110 ch = asciilib_utf16_decode(&q, e,
5111 PyUnicode_1BYTE_DATA(unicode), &outpos,
5112 native_ordering);
5113 else
5114 ch = ucs1lib_utf16_decode(&q, e,
5115 PyUnicode_1BYTE_DATA(unicode), &outpos,
5116 native_ordering);
5117 } else if (kind == PyUnicode_2BYTE_KIND) {
5118 ch = ucs2lib_utf16_decode(&q, e,
5119 PyUnicode_2BYTE_DATA(unicode), &outpos,
5120 native_ordering);
5121 } else {
5122 assert(kind == PyUnicode_4BYTE_KIND);
5123 ch = ucs4lib_utf16_decode(&q, e,
5124 PyUnicode_4BYTE_DATA(unicode), &outpos,
5125 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005126 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005127 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005128
Antoine Pitrou63065d72012-05-15 23:48:04 +02005129 switch (ch)
5130 {
5131 case 0:
5132 /* remaining byte at the end? (size should be even) */
5133 if (q == e || consumed)
5134 goto End;
5135 errmsg = "truncated data";
5136 startinpos = ((const char *)q) - starts;
5137 endinpos = ((const char *)e) - starts;
5138 break;
5139 /* The remaining input chars are ignored if the callback
5140 chooses to skip the input */
5141 case 1:
5142 errmsg = "unexpected end of data";
5143 startinpos = ((const char *)q) - 2 - starts;
5144 endinpos = ((const char *)e) - starts;
5145 break;
5146 case 2:
5147 errmsg = "illegal encoding";
5148 startinpos = ((const char *)q) - 2 - starts;
5149 endinpos = startinpos + 2;
5150 break;
5151 case 3:
5152 errmsg = "illegal UTF-16 surrogate";
5153 startinpos = ((const char *)q) - 4 - starts;
5154 endinpos = startinpos + 2;
5155 break;
5156 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005157 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5158 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005159 continue;
5160 }
5161
Benjamin Peterson29060642009-01-31 22:14:21 +00005162 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005163 errors,
5164 &errorHandler,
5165 "utf16", errmsg,
5166 &starts,
5167 (const char **)&e,
5168 &startinpos,
5169 &endinpos,
5170 &exc,
5171 (const char **)&q,
5172 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005173 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 }
5176
Antoine Pitrou63065d72012-05-15 23:48:04 +02005177End:
Walter Dörwald69652032004-09-07 20:24:22 +00005178 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005180
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005182 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 goto onError;
5184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005185 Py_XDECREF(errorHandler);
5186 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005187 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005191 Py_XDECREF(errorHandler);
5192 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 return NULL;
5194}
5195
Tim Peters772747b2001-08-09 22:21:55 +00005196PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005197_PyUnicode_EncodeUTF16(PyObject *str,
5198 const char *errors,
5199 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005201 enum PyUnicode_Kind kind;
5202 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005203 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005204 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005205 unsigned short *out;
5206 Py_ssize_t bytesize;
5207 Py_ssize_t pairs;
5208#ifdef WORDS_BIGENDIAN
5209 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005210#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005211 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005212#endif
5213
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005214 if (!PyUnicode_Check(str)) {
5215 PyErr_BadArgument();
5216 return NULL;
5217 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005218 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005219 return NULL;
5220 kind = PyUnicode_KIND(str);
5221 data = PyUnicode_DATA(str);
5222 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005223
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005224 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005225 if (kind == PyUnicode_4BYTE_KIND) {
5226 const Py_UCS4 *in = (const Py_UCS4 *)data;
5227 const Py_UCS4 *end = in + len;
5228 while (in < end)
5229 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005230 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005231 }
5232 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005234 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005235 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 if (v == NULL)
5237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005239 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005240 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005241 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005243 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005244 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005245 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005246
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005247 switch (kind) {
5248 case PyUnicode_1BYTE_KIND: {
5249 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5250 break;
Tim Peters772747b2001-08-09 22:21:55 +00005251 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005252 case PyUnicode_2BYTE_KIND: {
5253 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5254 break;
Tim Peters772747b2001-08-09 22:21:55 +00005255 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005256 case PyUnicode_4BYTE_KIND: {
5257 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5258 break;
5259 }
5260 default:
5261 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005262 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005263
5264 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005265 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266}
5267
Alexander Belopolsky40018472011-02-26 01:02:56 +00005268PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005269PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5270 Py_ssize_t size,
5271 const char *errors,
5272 int byteorder)
5273{
5274 PyObject *result;
5275 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5276 if (tmp == NULL)
5277 return NULL;
5278 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5279 Py_DECREF(tmp);
5280 return result;
5281}
5282
5283PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005284PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005286 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287}
5288
5289/* --- Unicode Escape Codec ----------------------------------------------- */
5290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005291/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5292 if all the escapes in the string make it still a valid ASCII string.
5293 Returns -1 if any escapes were found which cause the string to
5294 pop out of ASCII range. Otherwise returns the length of the
5295 required buffer to hold the string.
5296 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005297static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005298length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5299{
5300 const unsigned char *p = (const unsigned char *)s;
5301 const unsigned char *end = p + size;
5302 Py_ssize_t length = 0;
5303
5304 if (size < 0)
5305 return -1;
5306
5307 for (; p < end; ++p) {
5308 if (*p > 127) {
5309 /* Non-ASCII */
5310 return -1;
5311 }
5312 else if (*p != '\\') {
5313 /* Normal character */
5314 ++length;
5315 }
5316 else {
5317 /* Backslash-escape, check next char */
5318 ++p;
5319 /* Escape sequence reaches till end of string or
5320 non-ASCII follow-up. */
5321 if (p >= end || *p > 127)
5322 return -1;
5323 switch (*p) {
5324 case '\n':
5325 /* backslash + \n result in zero characters */
5326 break;
5327 case '\\': case '\'': case '\"':
5328 case 'b': case 'f': case 't':
5329 case 'n': case 'r': case 'v': case 'a':
5330 ++length;
5331 break;
5332 case '0': case '1': case '2': case '3':
5333 case '4': case '5': case '6': case '7':
5334 case 'x': case 'u': case 'U': case 'N':
5335 /* these do not guarantee ASCII characters */
5336 return -1;
5337 default:
5338 /* count the backslash + the other character */
5339 length += 2;
5340 }
5341 }
5342 }
5343 return length;
5344}
5345
Fredrik Lundh06d12682001-01-24 07:59:11 +00005346static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005347
Alexander Belopolsky40018472011-02-26 01:02:56 +00005348PyObject *
5349PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005350 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005354 Py_ssize_t startinpos;
5355 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005356 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005357 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005359 char* message;
5360 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005361 PyObject *errorHandler = NULL;
5362 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005363 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005364 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005365
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005366 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367
5368 /* After length_of_escaped_ascii_string() there are two alternatives,
5369 either the string is pure ASCII with named escapes like \n, etc.
5370 and we determined it's exact size (common case)
5371 or it contains \x, \u, ... escape sequences. then we create a
5372 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005373 if (len >= 0) {
5374 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005375 if (!v)
5376 goto onError;
5377 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005378 }
5379 else {
5380 /* Escaped strings will always be longer than the resulting
5381 Unicode string, so we start with size here and then reduce the
5382 length after conversion to the true value.
5383 (but if the error callback returns a long replacement string
5384 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005385 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386 if (!v)
5387 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005388 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005389 }
5390
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005392 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 while (s < end) {
5397 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005398 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005399 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005401 /* The only case in which i == ascii_length is a backslash
5402 followed by a newline. */
5403 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 /* Non-escape characters are interpreted as Unicode ordinals */
5406 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005407 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5408 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 continue;
5410 }
5411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005412 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 /* \ - Escapes */
5414 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005415 c = *s++;
5416 if (s > end)
5417 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005418
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005419 /* The only case in which i == ascii_length is a backslash
5420 followed by a newline. */
5421 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005423 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424
Benjamin Peterson29060642009-01-31 22:14:21 +00005425 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005426#define WRITECHAR(ch) \
5427 do { \
5428 if (unicode_putchar(&v, &i, ch) < 0) \
5429 goto onError; \
5430 }while(0)
5431
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005433 case '\\': WRITECHAR('\\'); break;
5434 case '\'': WRITECHAR('\''); break;
5435 case '\"': WRITECHAR('\"'); break;
5436 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005437 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005438 case 'f': WRITECHAR('\014'); break;
5439 case 't': WRITECHAR('\t'); break;
5440 case 'n': WRITECHAR('\n'); break;
5441 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005442 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005443 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005445 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446
Benjamin Peterson29060642009-01-31 22:14:21 +00005447 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 case '0': case '1': case '2': case '3':
5449 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005450 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005451 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005452 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005453 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005454 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005456 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 break;
5458
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 /* hex escapes */
5460 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005462 digits = 2;
5463 message = "truncated \\xXX escape";
5464 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465
Benjamin Peterson29060642009-01-31 22:14:21 +00005466 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005468 digits = 4;
5469 message = "truncated \\uXXXX escape";
5470 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005473 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005474 digits = 8;
5475 message = "truncated \\UXXXXXXXX escape";
5476 hexescape:
5477 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005478 if (s+digits>end) {
5479 endinpos = size;
5480 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005481 errors, &errorHandler,
5482 "unicodeescape", "end of string in escape sequence",
5483 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005484 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005485 goto onError;
5486 goto nextByte;
5487 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005488 for (j = 0; j < digits; ++j) {
5489 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005490 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005491 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005492 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 errors, &errorHandler,
5494 "unicodeescape", message,
5495 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005496 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005497 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005498 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005499 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005500 }
5501 chr = (chr<<4) & ~0xF;
5502 if (c >= '0' && c <= '9')
5503 chr += c - '0';
5504 else if (c >= 'a' && c <= 'f')
5505 chr += 10 + c - 'a';
5506 else
5507 chr += 10 + c - 'A';
5508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005510 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005511 /* _decoding_error will have already written into the
5512 target buffer. */
5513 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005514 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005515 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005516 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005517 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005518 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005520 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 errors, &errorHandler,
5522 "unicodeescape", "illegal Unicode character",
5523 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005525 goto onError;
5526 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005527 break;
5528
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005530 case 'N':
5531 message = "malformed \\N character escape";
5532 if (ucnhash_CAPI == NULL) {
5533 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5535 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005536 if (ucnhash_CAPI == NULL)
5537 goto ucnhashError;
5538 }
5539 if (*s == '{') {
5540 const char *start = s+1;
5541 /* look for the closing brace */
5542 while (*s != '}' && s < end)
5543 s++;
5544 if (s > start && s < end && *s == '}') {
5545 /* found a name. look it up in the unicode database */
5546 message = "unknown Unicode character name";
5547 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005549 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005550 goto store;
5551 }
5552 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005554 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 errors, &errorHandler,
5556 "unicodeescape", message,
5557 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005558 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005559 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560 break;
5561
5562 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005563 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005564 message = "\\ at end of string";
5565 s--;
5566 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005567 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005568 errors, &errorHandler,
5569 "unicodeescape", message,
5570 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005572 goto onError;
5573 }
5574 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005575 WRITECHAR('\\');
5576 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005577 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005578 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005580 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005583#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005584
Victor Stinner16e6a802011-12-12 13:24:15 +01005585 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005586 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005587 Py_XDECREF(errorHandler);
5588 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005589 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005590
Benjamin Peterson29060642009-01-31 22:14:21 +00005591 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005592 PyErr_SetString(
5593 PyExc_UnicodeError,
5594 "\\N escapes not supported (can't load unicodedata module)"
5595 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005596 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005597 Py_XDECREF(errorHandler);
5598 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005599 return NULL;
5600
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005603 Py_XDECREF(errorHandler);
5604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return NULL;
5606}
5607
5608/* Return a Unicode-Escape string version of the Unicode object.
5609
5610 If quotes is true, the string is enclosed in u"" or u'' quotes as
5611 appropriate.
5612
5613*/
5614
Alexander Belopolsky40018472011-02-26 01:02:56 +00005615PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005616PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005618 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005619 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005621 int kind;
5622 void *data;
5623 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624
Ezio Melottie7f90372012-10-05 03:33:31 +03005625 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005626 escape.
5627
Ezio Melottie7f90372012-10-05 03:33:31 +03005628 For UCS1 strings it's '\xxx', 4 bytes per source character.
5629 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5630 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005631 */
5632
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005633 if (!PyUnicode_Check(unicode)) {
5634 PyErr_BadArgument();
5635 return NULL;
5636 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005637 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005638 return NULL;
5639 len = PyUnicode_GET_LENGTH(unicode);
5640 kind = PyUnicode_KIND(unicode);
5641 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005642 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5644 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5645 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5646 }
5647
5648 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005649 return PyBytes_FromStringAndSize(NULL, 0);
5650
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005651 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005653
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005654 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005655 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005656 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005657 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658 if (repr == NULL)
5659 return NULL;
5660
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005661 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005663 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005664 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005665
Walter Dörwald79e913e2007-05-12 11:08:06 +00005666 /* Escape backslashes */
5667 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668 *p++ = '\\';
5669 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005670 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005671 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005672
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005673 /* Map 21-bit characters to '\U00xxxxxx' */
5674 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005675 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005676 *p++ = '\\';
5677 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005678 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5679 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5680 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5681 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5682 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5683 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5684 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5685 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005686 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005687 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005690 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 *p++ = '\\';
5692 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005693 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5694 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5695 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5696 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005698
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005699 /* Map special whitespace to '\t', \n', '\r' */
5700 else if (ch == '\t') {
5701 *p++ = '\\';
5702 *p++ = 't';
5703 }
5704 else if (ch == '\n') {
5705 *p++ = '\\';
5706 *p++ = 'n';
5707 }
5708 else if (ch == '\r') {
5709 *p++ = '\\';
5710 *p++ = 'r';
5711 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005712
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005713 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005714 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005716 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005717 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5718 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005719 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005720
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721 /* Copy everything else as-is */
5722 else
5723 *p++ = (char) ch;
5724 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005726 assert(p - PyBytes_AS_STRING(repr) > 0);
5727 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5728 return NULL;
5729 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730}
5731
Alexander Belopolsky40018472011-02-26 01:02:56 +00005732PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005733PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5734 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005736 PyObject *result;
5737 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5738 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 result = PyUnicode_AsUnicodeEscapeString(tmp);
5741 Py_DECREF(tmp);
5742 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743}
5744
5745/* --- Raw Unicode Escape Codec ------------------------------------------- */
5746
Alexander Belopolsky40018472011-02-26 01:02:56 +00005747PyObject *
5748PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005749 Py_ssize_t size,
5750 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005752 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005753 Py_ssize_t startinpos;
5754 Py_ssize_t endinpos;
5755 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005756 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 const char *end;
5758 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005759 PyObject *errorHandler = NULL;
5760 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 /* Escaped strings will always be longer than the resulting
5763 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005764 length after conversion to the true value. (But decoding error
5765 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005766 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005770 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005771 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 end = s + size;
5773 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 unsigned char c;
5775 Py_UCS4 x;
5776 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005777 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 /* Non-escape characters are interpreted as Unicode ordinals */
5780 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5782 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005784 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005785 startinpos = s-starts;
5786
5787 /* \u-escapes are only interpreted iff the number of leading
5788 backslashes if odd */
5789 bs = s;
5790 for (;s < end;) {
5791 if (*s != '\\')
5792 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005793 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5794 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 }
5796 if (((s - bs) & 1) == 0 ||
5797 s >= end ||
5798 (*s != 'u' && *s != 'U')) {
5799 continue;
5800 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005801 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 count = *s=='u' ? 4 : 8;
5803 s++;
5804
5805 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 for (x = 0, i = 0; i < count; ++i, ++s) {
5807 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005808 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005809 endinpos = s-starts;
5810 if (unicode_decode_call_errorhandler(
5811 errors, &errorHandler,
5812 "rawunicodeescape", "truncated \\uXXXX",
5813 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005814 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005815 goto onError;
5816 goto nextByte;
5817 }
5818 x = (x<<4) & ~0xF;
5819 if (c >= '0' && c <= '9')
5820 x += c - '0';
5821 else if (c >= 'a' && c <= 'f')
5822 x += 10 + c - 'a';
5823 else
5824 x += 10 + c - 'A';
5825 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005826 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 if (unicode_putchar(&v, &outpos, x) < 0)
5828 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005829 } else {
5830 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005831 if (unicode_decode_call_errorhandler(
5832 errors, &errorHandler,
5833 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005834 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005835 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005837 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 nextByte:
5839 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005840 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005841 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005845 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005846
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 Py_XDECREF(errorHandler);
5850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 return NULL;
5852}
5853
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005854
Alexander Belopolsky40018472011-02-26 01:02:56 +00005855PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005856PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005858 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 char *p;
5860 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005861 Py_ssize_t expandsize, pos;
5862 int kind;
5863 void *data;
5864 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005866 if (!PyUnicode_Check(unicode)) {
5867 PyErr_BadArgument();
5868 return NULL;
5869 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005870 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 return NULL;
5872 kind = PyUnicode_KIND(unicode);
5873 data = PyUnicode_DATA(unicode);
5874 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005875 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5876 bytes, and 1 byte characters 4. */
5877 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005878
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005880 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005881
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 if (repr == NULL)
5884 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005886 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005888 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 for (pos = 0; pos < len; pos++) {
5890 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 /* Map 32-bit characters to '\Uxxxxxxxx' */
5892 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005893 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005894 *p++ = '\\';
5895 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005896 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5897 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5898 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5899 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5900 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5901 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5902 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5903 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005904 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005905 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 *p++ = '\\';
5908 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005909 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5910 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5911 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5912 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005914 /* Copy everything else as-is */
5915 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 *p++ = (char) ch;
5917 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005918
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 assert(p > q);
5920 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005921 return NULL;
5922 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923}
5924
Alexander Belopolsky40018472011-02-26 01:02:56 +00005925PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005926PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5927 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005929 PyObject *result;
5930 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5931 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005932 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5934 Py_DECREF(tmp);
5935 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936}
5937
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005938/* --- Unicode Internal Codec ------------------------------------------- */
5939
Alexander Belopolsky40018472011-02-26 01:02:56 +00005940PyObject *
5941_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005942 Py_ssize_t size,
5943 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005944{
5945 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005946 Py_ssize_t startinpos;
5947 Py_ssize_t endinpos;
5948 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005949 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005950 const char *end;
5951 const char *reason;
5952 PyObject *errorHandler = NULL;
5953 PyObject *exc = NULL;
5954
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005955 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005956 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005957 1))
5958 return NULL;
5959
Thomas Wouters89f507f2006-12-13 04:49:30 +00005960 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005961 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005962 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005964 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005965 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005966 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005967 end = s + size;
5968
5969 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005970 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005971 Py_UCS4 ch;
5972 /* We copy the raw representation one byte at a time because the
5973 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005974 ((char *) &uch)[0] = s[0];
5975 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005976#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005977 ((char *) &uch)[2] = s[2];
5978 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005979#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005980 ch = uch;
5981
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005982 /* We have to sanity check the raw data, otherwise doom looms for
5983 some malformed UCS-4 data. */
5984 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005985#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005986 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005987#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005988 end-s < Py_UNICODE_SIZE
5989 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005991 startinpos = s - starts;
5992 if (end-s < Py_UNICODE_SIZE) {
5993 endinpos = end-starts;
5994 reason = "truncated input";
5995 }
5996 else {
5997 endinpos = s - starts + Py_UNICODE_SIZE;
5998 reason = "illegal code point (> 0x10FFFF)";
5999 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006000 if (unicode_decode_call_errorhandler(
6001 errors, &errorHandler,
6002 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006003 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006004 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006005 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006006 continue;
6007 }
6008
6009 s += Py_UNICODE_SIZE;
6010#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006011 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006012 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006013 Py_UNICODE uch2;
6014 ((char *) &uch2)[0] = s[0];
6015 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006016 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006017 {
Victor Stinner551ac952011-11-29 22:58:13 +01006018 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006019 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006020 }
6021 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006022#endif
6023
6024 if (unicode_putchar(&v, &outpos, ch) < 0)
6025 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006026 }
6027
Victor Stinner16e6a802011-12-12 13:24:15 +01006028 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006029 goto onError;
6030 Py_XDECREF(errorHandler);
6031 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006032 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006033
Benjamin Peterson29060642009-01-31 22:14:21 +00006034 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006035 Py_XDECREF(v);
6036 Py_XDECREF(errorHandler);
6037 Py_XDECREF(exc);
6038 return NULL;
6039}
6040
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041/* --- Latin-1 Codec ------------------------------------------------------ */
6042
Alexander Belopolsky40018472011-02-26 01:02:56 +00006043PyObject *
6044PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006045 Py_ssize_t size,
6046 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006049 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050}
6051
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006052/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006053static void
6054make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006055 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006056 PyObject *unicode,
6057 Py_ssize_t startpos, Py_ssize_t endpos,
6058 const char *reason)
6059{
6060 if (*exceptionObject == NULL) {
6061 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006062 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006063 encoding, unicode, startpos, endpos, reason);
6064 }
6065 else {
6066 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6067 goto onError;
6068 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6069 goto onError;
6070 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6071 goto onError;
6072 return;
6073 onError:
6074 Py_DECREF(*exceptionObject);
6075 *exceptionObject = NULL;
6076 }
6077}
6078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006080static void
6081raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006082 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006083 PyObject *unicode,
6084 Py_ssize_t startpos, Py_ssize_t endpos,
6085 const char *reason)
6086{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006087 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006088 encoding, unicode, startpos, endpos, reason);
6089 if (*exceptionObject != NULL)
6090 PyCodec_StrictErrors(*exceptionObject);
6091}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006092
6093/* error handling callback helper:
6094 build arguments, call the callback and check the arguments,
6095 put the result into newpos and return the replacement string, which
6096 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006097static PyObject *
6098unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006099 PyObject **errorHandler,
6100 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006101 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006102 Py_ssize_t startpos, Py_ssize_t endpos,
6103 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006104{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006105 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006106 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 PyObject *restuple;
6108 PyObject *resunicode;
6109
6110 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006111 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 }
6115
Benjamin Petersonbac79492012-01-14 13:34:47 -05006116 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006117 return NULL;
6118 len = PyUnicode_GET_LENGTH(unicode);
6119
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006120 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006121 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124
6125 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006129 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006130 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 Py_DECREF(restuple);
6132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006134 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 &resunicode, newpos)) {
6136 Py_DECREF(restuple);
6137 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006138 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006139 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6140 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6141 Py_DECREF(restuple);
6142 return NULL;
6143 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006144 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006145 *newpos = len + *newpos;
6146 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6148 Py_DECREF(restuple);
6149 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151 Py_INCREF(resunicode);
6152 Py_DECREF(restuple);
6153 return resunicode;
6154}
6155
Alexander Belopolsky40018472011-02-26 01:02:56 +00006156static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006157unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006158 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006159 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006161 /* input state */
6162 Py_ssize_t pos=0, size;
6163 int kind;
6164 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 /* output object */
6166 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006167 /* pointer into the output */
6168 char *str;
6169 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006170 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006171 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6172 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006173 PyObject *errorHandler = NULL;
6174 PyObject *exc = NULL;
6175 /* the following variable is used for caching string comparisons
6176 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6177 int known_errorHandler = -1;
6178
Benjamin Petersonbac79492012-01-14 13:34:47 -05006179 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006180 return NULL;
6181 size = PyUnicode_GET_LENGTH(unicode);
6182 kind = PyUnicode_KIND(unicode);
6183 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006184 /* allocate enough for a simple encoding without
6185 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006186 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006187 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006188 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006189 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006190 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006191 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 ressize = size;
6193
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006194 while (pos < size) {
6195 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196
Benjamin Peterson29060642009-01-31 22:14:21 +00006197 /* can we encode this? */
6198 if (c<limit) {
6199 /* no overflow check, because we know that the space is enough */
6200 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006201 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 Py_ssize_t requiredsize;
6205 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006206 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006208 Py_ssize_t collstart = pos;
6209 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006211 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 ++collend;
6213 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6214 if (known_errorHandler==-1) {
6215 if ((errors==NULL) || (!strcmp(errors, "strict")))
6216 known_errorHandler = 1;
6217 else if (!strcmp(errors, "replace"))
6218 known_errorHandler = 2;
6219 else if (!strcmp(errors, "ignore"))
6220 known_errorHandler = 3;
6221 else if (!strcmp(errors, "xmlcharrefreplace"))
6222 known_errorHandler = 4;
6223 else
6224 known_errorHandler = 0;
6225 }
6226 switch (known_errorHandler) {
6227 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006228 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 goto onError;
6230 case 2: /* replace */
6231 while (collstart++<collend)
6232 *str++ = '?'; /* fall through */
6233 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006234 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006235 break;
6236 case 4: /* xmlcharrefreplace */
6237 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 /* determine replacement size */
6239 for (i = collstart, repsize = 0; i < collend; ++i) {
6240 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6241 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006243 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006244 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006245 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006247 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006249 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006253 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006254 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006256 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006258 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 if (requiredsize > ressize) {
6260 if (requiredsize<2*ressize)
6261 requiredsize = 2*ressize;
6262 if (_PyBytes_Resize(&res, requiredsize))
6263 goto onError;
6264 str = PyBytes_AS_STRING(res) + respos;
6265 ressize = requiredsize;
6266 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006267 /* generate replacement */
6268 for (i = collstart; i < collend; ++i) {
6269 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006270 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006271 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 break;
6273 default:
6274 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 encoding, reason, unicode, &exc,
6276 collstart, collend, &newpos);
6277 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006278 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006280 if (PyBytes_Check(repunicode)) {
6281 /* Directly copy bytes result to output. */
6282 repsize = PyBytes_Size(repunicode);
6283 if (repsize > 1) {
6284 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006285 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006286 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6287 Py_DECREF(repunicode);
6288 goto onError;
6289 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006290 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006291 ressize += repsize-1;
6292 }
6293 memcpy(str, PyBytes_AsString(repunicode), repsize);
6294 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006296 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006297 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 /* need more space? (at least enough for what we
6300 have+the replacement+the rest of the string, so
6301 we won't have to check space for encodable characters) */
6302 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006303 repsize = PyUnicode_GET_LENGTH(repunicode);
6304 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006305 if (requiredsize > ressize) {
6306 if (requiredsize<2*ressize)
6307 requiredsize = 2*ressize;
6308 if (_PyBytes_Resize(&res, requiredsize)) {
6309 Py_DECREF(repunicode);
6310 goto onError;
6311 }
6312 str = PyBytes_AS_STRING(res) + respos;
6313 ressize = requiredsize;
6314 }
6315 /* check if there is anything unencodable in the replacement
6316 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006317 for (i = 0; repsize-->0; ++i, ++str) {
6318 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006319 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006320 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006321 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 Py_DECREF(repunicode);
6323 goto onError;
6324 }
6325 *str = (char)c;
6326 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006327 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006328 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006329 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006330 }
6331 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006332 /* Resize if we allocated to much */
6333 size = str - PyBytes_AS_STRING(res);
6334 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006335 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006336 if (_PyBytes_Resize(&res, size) < 0)
6337 goto onError;
6338 }
6339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006340 Py_XDECREF(errorHandler);
6341 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006342 return res;
6343
6344 onError:
6345 Py_XDECREF(res);
6346 Py_XDECREF(errorHandler);
6347 Py_XDECREF(exc);
6348 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006349}
6350
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006352PyObject *
6353PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006354 Py_ssize_t size,
6355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006357 PyObject *result;
6358 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6359 if (unicode == NULL)
6360 return NULL;
6361 result = unicode_encode_ucs1(unicode, errors, 256);
6362 Py_DECREF(unicode);
6363 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364}
6365
Alexander Belopolsky40018472011-02-26 01:02:56 +00006366PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006367_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368{
6369 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006370 PyErr_BadArgument();
6371 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006373 if (PyUnicode_READY(unicode) == -1)
6374 return NULL;
6375 /* Fast path: if it is a one-byte string, construct
6376 bytes object directly. */
6377 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6378 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6379 PyUnicode_GET_LENGTH(unicode));
6380 /* Non-Latin-1 characters present. Defer to above function to
6381 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006382 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006383}
6384
6385PyObject*
6386PyUnicode_AsLatin1String(PyObject *unicode)
6387{
6388 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389}
6390
6391/* --- 7-bit ASCII Codec -------------------------------------------------- */
6392
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
6394PyUnicode_DecodeASCII(const char *s,
6395 Py_ssize_t size,
6396 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006399 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006400 int kind;
6401 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006402 Py_ssize_t startinpos;
6403 Py_ssize_t endinpos;
6404 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 const char *e;
6406 PyObject *errorHandler = NULL;
6407 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006409 if (size == 0) {
6410 Py_INCREF(unicode_empty);
6411 return unicode_empty;
6412 }
6413
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006415 if (size == 1 && (unsigned char)s[0] < 128)
6416 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006417
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006418 unicode = PyUnicode_New(size, 127);
6419 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006420 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006421
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006423 data = PyUnicode_1BYTE_DATA(unicode);
6424 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6425 if (outpos == size)
6426 return unicode;
6427
6428 s += outpos;
6429 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 register unsigned char c = (unsigned char)*s;
6432 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006433 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 ++s;
6435 }
6436 else {
6437 startinpos = s-starts;
6438 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 if (unicode_decode_call_errorhandler(
6440 errors, &errorHandler,
6441 "ascii", "ordinal not in range(128)",
6442 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006443 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006445 kind = PyUnicode_KIND(unicode);
6446 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006449 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006450 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 Py_XDECREF(errorHandler);
6452 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006453 assert(_PyUnicode_CheckConsistency(unicode, 1));
6454 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006455
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006457 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458 Py_XDECREF(errorHandler);
6459 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 return NULL;
6461}
6462
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006463/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006464PyObject *
6465PyUnicode_EncodeASCII(const Py_UNICODE *p,
6466 Py_ssize_t size,
6467 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 PyObject *result;
6470 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6471 if (unicode == NULL)
6472 return NULL;
6473 result = unicode_encode_ucs1(unicode, errors, 128);
6474 Py_DECREF(unicode);
6475 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476}
6477
Alexander Belopolsky40018472011-02-26 01:02:56 +00006478PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006479_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480{
6481 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 PyErr_BadArgument();
6483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006485 if (PyUnicode_READY(unicode) == -1)
6486 return NULL;
6487 /* Fast path: if it is an ASCII-only string, construct bytes object
6488 directly. Else defer to above function to raise the exception. */
6489 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6490 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6491 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006492 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006493}
6494
6495PyObject *
6496PyUnicode_AsASCIIString(PyObject *unicode)
6497{
6498 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006499}
6500
Victor Stinner99b95382011-07-04 14:23:54 +02006501#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006502
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006503/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006504
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006505#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006506#define NEED_RETRY
6507#endif
6508
Victor Stinner3a50e702011-10-18 21:21:00 +02006509#ifndef WC_ERR_INVALID_CHARS
6510# define WC_ERR_INVALID_CHARS 0x0080
6511#endif
6512
6513static char*
6514code_page_name(UINT code_page, PyObject **obj)
6515{
6516 *obj = NULL;
6517 if (code_page == CP_ACP)
6518 return "mbcs";
6519 if (code_page == CP_UTF7)
6520 return "CP_UTF7";
6521 if (code_page == CP_UTF8)
6522 return "CP_UTF8";
6523
6524 *obj = PyBytes_FromFormat("cp%u", code_page);
6525 if (*obj == NULL)
6526 return NULL;
6527 return PyBytes_AS_STRING(*obj);
6528}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006529
Alexander Belopolsky40018472011-02-26 01:02:56 +00006530static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006531is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006532{
6533 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006534 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006535
Victor Stinner3a50e702011-10-18 21:21:00 +02006536 if (!IsDBCSLeadByteEx(code_page, *curr))
6537 return 0;
6538
6539 prev = CharPrevExA(code_page, s, curr, 0);
6540 if (prev == curr)
6541 return 1;
6542 /* FIXME: This code is limited to "true" double-byte encodings,
6543 as it assumes an incomplete character consists of a single
6544 byte. */
6545 if (curr - prev == 2)
6546 return 1;
6547 if (!IsDBCSLeadByteEx(code_page, *prev))
6548 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006549 return 0;
6550}
6551
Victor Stinner3a50e702011-10-18 21:21:00 +02006552static DWORD
6553decode_code_page_flags(UINT code_page)
6554{
6555 if (code_page == CP_UTF7) {
6556 /* The CP_UTF7 decoder only supports flags=0 */
6557 return 0;
6558 }
6559 else
6560 return MB_ERR_INVALID_CHARS;
6561}
6562
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006563/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006564 * Decode a byte string from a Windows code page into unicode object in strict
6565 * mode.
6566 *
6567 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6568 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006569 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006570static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006571decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006572 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006573 const char *in,
6574 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006575{
Victor Stinner3a50e702011-10-18 21:21:00 +02006576 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006577 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006578 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579
6580 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006581 assert(insize > 0);
6582 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6583 if (outsize <= 0)
6584 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006585
6586 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006587 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006588 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006589 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006590 if (*v == NULL)
6591 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006592 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006593 }
6594 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006596 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006597 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006599 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006600 }
6601
6602 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6604 if (outsize <= 0)
6605 goto error;
6606 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006607
Victor Stinner3a50e702011-10-18 21:21:00 +02006608error:
6609 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6610 return -2;
6611 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006612 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006613}
6614
Victor Stinner3a50e702011-10-18 21:21:00 +02006615/*
6616 * Decode a byte string from a code page into unicode object with an error
6617 * handler.
6618 *
6619 * Returns consumed size if succeed, or raise a WindowsError or
6620 * UnicodeDecodeError exception and returns -1 on error.
6621 */
6622static int
6623decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006624 PyObject **v,
6625 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006626 const char *errors)
6627{
6628 const char *startin = in;
6629 const char *endin = in + size;
6630 const DWORD flags = decode_code_page_flags(code_page);
6631 /* Ideally, we should get reason from FormatMessage. This is the Windows
6632 2000 English version of the message. */
6633 const char *reason = "No mapping for the Unicode character exists "
6634 "in the target code page.";
6635 /* each step cannot decode more than 1 character, but a character can be
6636 represented as a surrogate pair */
6637 wchar_t buffer[2], *startout, *out;
6638 int insize, outsize;
6639 PyObject *errorHandler = NULL;
6640 PyObject *exc = NULL;
6641 PyObject *encoding_obj = NULL;
6642 char *encoding;
6643 DWORD err;
6644 int ret = -1;
6645
6646 assert(size > 0);
6647
6648 encoding = code_page_name(code_page, &encoding_obj);
6649 if (encoding == NULL)
6650 return -1;
6651
6652 if (errors == NULL || strcmp(errors, "strict") == 0) {
6653 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6654 UnicodeDecodeError. */
6655 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6656 if (exc != NULL) {
6657 PyCodec_StrictErrors(exc);
6658 Py_CLEAR(exc);
6659 }
6660 goto error;
6661 }
6662
6663 if (*v == NULL) {
6664 /* Create unicode object */
6665 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6666 PyErr_NoMemory();
6667 goto error;
6668 }
Victor Stinnerab595942011-12-17 04:59:06 +01006669 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006670 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006671 if (*v == NULL)
6672 goto error;
6673 startout = PyUnicode_AS_UNICODE(*v);
6674 }
6675 else {
6676 /* Extend unicode object */
6677 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6678 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6679 PyErr_NoMemory();
6680 goto error;
6681 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006682 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006683 goto error;
6684 startout = PyUnicode_AS_UNICODE(*v) + n;
6685 }
6686
6687 /* Decode the byte string character per character */
6688 out = startout;
6689 while (in < endin)
6690 {
6691 /* Decode a character */
6692 insize = 1;
6693 do
6694 {
6695 outsize = MultiByteToWideChar(code_page, flags,
6696 in, insize,
6697 buffer, Py_ARRAY_LENGTH(buffer));
6698 if (outsize > 0)
6699 break;
6700 err = GetLastError();
6701 if (err != ERROR_NO_UNICODE_TRANSLATION
6702 && err != ERROR_INSUFFICIENT_BUFFER)
6703 {
6704 PyErr_SetFromWindowsErr(0);
6705 goto error;
6706 }
6707 insize++;
6708 }
6709 /* 4=maximum length of a UTF-8 sequence */
6710 while (insize <= 4 && (in + insize) <= endin);
6711
6712 if (outsize <= 0) {
6713 Py_ssize_t startinpos, endinpos, outpos;
6714
6715 startinpos = in - startin;
6716 endinpos = startinpos + 1;
6717 outpos = out - PyUnicode_AS_UNICODE(*v);
6718 if (unicode_decode_call_errorhandler(
6719 errors, &errorHandler,
6720 encoding, reason,
6721 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006722 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 {
6724 goto error;
6725 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006726 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006727 }
6728 else {
6729 in += insize;
6730 memcpy(out, buffer, outsize * sizeof(wchar_t));
6731 out += outsize;
6732 }
6733 }
6734
6735 /* write a NUL character at the end */
6736 *out = 0;
6737
6738 /* Extend unicode object */
6739 outsize = out - startout;
6740 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006741 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006742 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006743 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006744
6745error:
6746 Py_XDECREF(encoding_obj);
6747 Py_XDECREF(errorHandler);
6748 Py_XDECREF(exc);
6749 return ret;
6750}
6751
Victor Stinner3a50e702011-10-18 21:21:00 +02006752static PyObject *
6753decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006754 const char *s, Py_ssize_t size,
6755 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006756{
Victor Stinner76a31a62011-11-04 00:05:13 +01006757 PyObject *v = NULL;
6758 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006759
Victor Stinner3a50e702011-10-18 21:21:00 +02006760 if (code_page < 0) {
6761 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6762 return NULL;
6763 }
6764
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006765 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006766 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006767
Victor Stinner76a31a62011-11-04 00:05:13 +01006768 do
6769 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006770#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006771 if (size > INT_MAX) {
6772 chunk_size = INT_MAX;
6773 final = 0;
6774 done = 0;
6775 }
6776 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006777#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006778 {
6779 chunk_size = (int)size;
6780 final = (consumed == NULL);
6781 done = 1;
6782 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783
Victor Stinner76a31a62011-11-04 00:05:13 +01006784 /* Skip trailing lead-byte unless 'final' is set */
6785 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6786 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787
Victor Stinner76a31a62011-11-04 00:05:13 +01006788 if (chunk_size == 0 && done) {
6789 if (v != NULL)
6790 break;
6791 Py_INCREF(unicode_empty);
6792 return unicode_empty;
6793 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794
Victor Stinner76a31a62011-11-04 00:05:13 +01006795
6796 converted = decode_code_page_strict(code_page, &v,
6797 s, chunk_size);
6798 if (converted == -2)
6799 converted = decode_code_page_errors(code_page, &v,
6800 s, chunk_size,
6801 errors);
6802 assert(converted != 0);
6803
6804 if (converted < 0) {
6805 Py_XDECREF(v);
6806 return NULL;
6807 }
6808
6809 if (consumed)
6810 *consumed += converted;
6811
6812 s += converted;
6813 size -= converted;
6814 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006815
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006816 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006817}
6818
Alexander Belopolsky40018472011-02-26 01:02:56 +00006819PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006820PyUnicode_DecodeCodePageStateful(int code_page,
6821 const char *s,
6822 Py_ssize_t size,
6823 const char *errors,
6824 Py_ssize_t *consumed)
6825{
6826 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6827}
6828
6829PyObject *
6830PyUnicode_DecodeMBCSStateful(const char *s,
6831 Py_ssize_t size,
6832 const char *errors,
6833 Py_ssize_t *consumed)
6834{
6835 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6836}
6837
6838PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006839PyUnicode_DecodeMBCS(const char *s,
6840 Py_ssize_t size,
6841 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006842{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6844}
6845
Victor Stinner3a50e702011-10-18 21:21:00 +02006846static DWORD
6847encode_code_page_flags(UINT code_page, const char *errors)
6848{
6849 if (code_page == CP_UTF8) {
6850 if (winver.dwMajorVersion >= 6)
6851 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6852 and later */
6853 return WC_ERR_INVALID_CHARS;
6854 else
6855 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6856 return 0;
6857 }
6858 else if (code_page == CP_UTF7) {
6859 /* CP_UTF7 only supports flags=0 */
6860 return 0;
6861 }
6862 else {
6863 if (errors != NULL && strcmp(errors, "replace") == 0)
6864 return 0;
6865 else
6866 return WC_NO_BEST_FIT_CHARS;
6867 }
6868}
6869
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 * Encode a Unicode string to a Windows code page into a byte string in strict
6872 * mode.
6873 *
6874 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6875 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006876 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006877static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006878encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006879 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006880 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881{
Victor Stinner554f3f02010-06-16 23:33:54 +00006882 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006883 BOOL *pusedDefaultChar = &usedDefaultChar;
6884 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006885 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006886 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006887 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 const DWORD flags = encode_code_page_flags(code_page, NULL);
6889 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006890 /* Create a substring so that we can get the UTF-16 representation
6891 of just the slice under consideration. */
6892 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893
Martin v. Löwis3d325192011-11-04 18:23:06 +01006894 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006895
Victor Stinner3a50e702011-10-18 21:21:00 +02006896 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006897 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006898 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006899 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006900
Victor Stinner2fc507f2011-11-04 20:06:39 +01006901 substring = PyUnicode_Substring(unicode, offset, offset+len);
6902 if (substring == NULL)
6903 return -1;
6904 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6905 if (p == NULL) {
6906 Py_DECREF(substring);
6907 return -1;
6908 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006909
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006910 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006911 outsize = WideCharToMultiByte(code_page, flags,
6912 p, size,
6913 NULL, 0,
6914 NULL, pusedDefaultChar);
6915 if (outsize <= 0)
6916 goto error;
6917 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006918 if (pusedDefaultChar && *pusedDefaultChar) {
6919 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006920 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006921 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006922
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006926 if (*outbytes == NULL) {
6927 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006929 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931 }
6932 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006933 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006934 const Py_ssize_t n = PyBytes_Size(*outbytes);
6935 if (outsize > PY_SSIZE_T_MAX - n) {
6936 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006937 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006938 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006939 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006940 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6941 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006942 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006943 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006944 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006945 }
6946
6947 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 outsize = WideCharToMultiByte(code_page, flags,
6949 p, size,
6950 out, outsize,
6951 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006952 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006953 if (outsize <= 0)
6954 goto error;
6955 if (pusedDefaultChar && *pusedDefaultChar)
6956 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006958
Victor Stinner3a50e702011-10-18 21:21:00 +02006959error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006960 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6962 return -2;
6963 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006964 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006965}
6966
Victor Stinner3a50e702011-10-18 21:21:00 +02006967/*
6968 * Encode a Unicode string to a Windows code page into a byte string using a
6969 * error handler.
6970 *
6971 * Returns consumed characters if succeed, or raise a WindowsError and returns
6972 * -1 on other error.
6973 */
6974static int
6975encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006976 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006977 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006978{
Victor Stinner3a50e702011-10-18 21:21:00 +02006979 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006980 Py_ssize_t pos = unicode_offset;
6981 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 /* Ideally, we should get reason from FormatMessage. This is the Windows
6983 2000 English version of the message. */
6984 const char *reason = "invalid character";
6985 /* 4=maximum length of a UTF-8 sequence */
6986 char buffer[4];
6987 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6988 Py_ssize_t outsize;
6989 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 PyObject *errorHandler = NULL;
6991 PyObject *exc = NULL;
6992 PyObject *encoding_obj = NULL;
6993 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01006994 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006995 PyObject *rep;
6996 int ret = -1;
6997
6998 assert(insize > 0);
6999
7000 encoding = code_page_name(code_page, &encoding_obj);
7001 if (encoding == NULL)
7002 return -1;
7003
7004 if (errors == NULL || strcmp(errors, "strict") == 0) {
7005 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7006 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007007 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007008 if (exc != NULL) {
7009 PyCodec_StrictErrors(exc);
7010 Py_DECREF(exc);
7011 }
7012 Py_XDECREF(encoding_obj);
7013 return -1;
7014 }
7015
7016 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7017 pusedDefaultChar = &usedDefaultChar;
7018 else
7019 pusedDefaultChar = NULL;
7020
7021 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7022 PyErr_NoMemory();
7023 goto error;
7024 }
7025 outsize = insize * Py_ARRAY_LENGTH(buffer);
7026
7027 if (*outbytes == NULL) {
7028 /* Create string object */
7029 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7030 if (*outbytes == NULL)
7031 goto error;
7032 out = PyBytes_AS_STRING(*outbytes);
7033 }
7034 else {
7035 /* Extend string object */
7036 Py_ssize_t n = PyBytes_Size(*outbytes);
7037 if (n > PY_SSIZE_T_MAX - outsize) {
7038 PyErr_NoMemory();
7039 goto error;
7040 }
7041 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7042 goto error;
7043 out = PyBytes_AS_STRING(*outbytes) + n;
7044 }
7045
7046 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007047 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007048 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007049 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7050 wchar_t chars[2];
7051 int charsize;
7052 if (ch < 0x10000) {
7053 chars[0] = (wchar_t)ch;
7054 charsize = 1;
7055 }
7056 else {
7057 ch -= 0x10000;
7058 chars[0] = 0xd800 + (ch >> 10);
7059 chars[1] = 0xdc00 + (ch & 0x3ff);
7060 charsize = 2;
7061 }
7062
Victor Stinner3a50e702011-10-18 21:21:00 +02007063 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007064 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 buffer, Py_ARRAY_LENGTH(buffer),
7066 NULL, pusedDefaultChar);
7067 if (outsize > 0) {
7068 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7069 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007070 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 memcpy(out, buffer, outsize);
7072 out += outsize;
7073 continue;
7074 }
7075 }
7076 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7077 PyErr_SetFromWindowsErr(0);
7078 goto error;
7079 }
7080
Victor Stinner3a50e702011-10-18 21:21:00 +02007081 rep = unicode_encode_call_errorhandler(
7082 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007083 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007084 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 if (rep == NULL)
7086 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007087 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007088
7089 if (PyBytes_Check(rep)) {
7090 outsize = PyBytes_GET_SIZE(rep);
7091 if (outsize != 1) {
7092 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7093 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7094 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7095 Py_DECREF(rep);
7096 goto error;
7097 }
7098 out = PyBytes_AS_STRING(*outbytes) + offset;
7099 }
7100 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7101 out += outsize;
7102 }
7103 else {
7104 Py_ssize_t i;
7105 enum PyUnicode_Kind kind;
7106 void *data;
7107
Benjamin Petersonbac79492012-01-14 13:34:47 -05007108 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 Py_DECREF(rep);
7110 goto error;
7111 }
7112
7113 outsize = PyUnicode_GET_LENGTH(rep);
7114 if (outsize != 1) {
7115 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7116 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7117 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7118 Py_DECREF(rep);
7119 goto error;
7120 }
7121 out = PyBytes_AS_STRING(*outbytes) + offset;
7122 }
7123 kind = PyUnicode_KIND(rep);
7124 data = PyUnicode_DATA(rep);
7125 for (i=0; i < outsize; i++) {
7126 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7127 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007128 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007129 encoding, unicode,
7130 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007131 "unable to encode error handler result to ASCII");
7132 Py_DECREF(rep);
7133 goto error;
7134 }
7135 *out = (unsigned char)ch;
7136 out++;
7137 }
7138 }
7139 Py_DECREF(rep);
7140 }
7141 /* write a NUL byte */
7142 *out = 0;
7143 outsize = out - PyBytes_AS_STRING(*outbytes);
7144 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7145 if (_PyBytes_Resize(outbytes, outsize) < 0)
7146 goto error;
7147 ret = 0;
7148
7149error:
7150 Py_XDECREF(encoding_obj);
7151 Py_XDECREF(errorHandler);
7152 Py_XDECREF(exc);
7153 return ret;
7154}
7155
Victor Stinner3a50e702011-10-18 21:21:00 +02007156static PyObject *
7157encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007158 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007159 const char *errors)
7160{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007161 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007162 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007163 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007164 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007165
Benjamin Petersonbac79492012-01-14 13:34:47 -05007166 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007167 return NULL;
7168 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007169
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 if (code_page < 0) {
7171 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7172 return NULL;
7173 }
7174
Martin v. Löwis3d325192011-11-04 18:23:06 +01007175 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007176 return PyBytes_FromStringAndSize(NULL, 0);
7177
Victor Stinner7581cef2011-11-03 22:32:33 +01007178 offset = 0;
7179 do
7180 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007181#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007182 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007183 chunks. */
7184 if (len > INT_MAX/2) {
7185 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007186 done = 0;
7187 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007188 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007191 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007192 done = 1;
7193 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194
Victor Stinner76a31a62011-11-04 00:05:13 +01007195 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007196 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007197 errors);
7198 if (ret == -2)
7199 ret = encode_code_page_errors(code_page, &outbytes,
7200 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007201 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007202 if (ret < 0) {
7203 Py_XDECREF(outbytes);
7204 return NULL;
7205 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206
Victor Stinner7581cef2011-11-03 22:32:33 +01007207 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007208 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007209 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 return outbytes;
7212}
7213
7214PyObject *
7215PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7216 Py_ssize_t size,
7217 const char *errors)
7218{
Victor Stinner7581cef2011-11-03 22:32:33 +01007219 PyObject *unicode, *res;
7220 unicode = PyUnicode_FromUnicode(p, size);
7221 if (unicode == NULL)
7222 return NULL;
7223 res = encode_code_page(CP_ACP, unicode, errors);
7224 Py_DECREF(unicode);
7225 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007226}
7227
7228PyObject *
7229PyUnicode_EncodeCodePage(int code_page,
7230 PyObject *unicode,
7231 const char *errors)
7232{
Victor Stinner7581cef2011-11-03 22:32:33 +01007233 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007234}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007235
Alexander Belopolsky40018472011-02-26 01:02:56 +00007236PyObject *
7237PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007238{
7239 if (!PyUnicode_Check(unicode)) {
7240 PyErr_BadArgument();
7241 return NULL;
7242 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007243 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007244}
7245
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007246#undef NEED_RETRY
7247
Victor Stinner99b95382011-07-04 14:23:54 +02007248#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007249
Guido van Rossumd57fd912000-03-10 22:53:23 +00007250/* --- Character Mapping Codec -------------------------------------------- */
7251
Alexander Belopolsky40018472011-02-26 01:02:56 +00007252PyObject *
7253PyUnicode_DecodeCharmap(const char *s,
7254 Py_ssize_t size,
7255 PyObject *mapping,
7256 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007257{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007258 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007259 Py_ssize_t startinpos;
7260 Py_ssize_t endinpos;
7261 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007263 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007264 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007265 PyObject *errorHandler = NULL;
7266 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007267
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 /* Default to Latin-1 */
7269 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007270 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007272 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007273 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007276 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007277 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007278 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007279 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007280 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007281 enum PyUnicode_Kind mapkind;
7282 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007283 Py_UCS4 x;
7284
Benjamin Petersonbac79492012-01-14 13:34:47 -05007285 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007286 return NULL;
7287
7288 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007289 mapdata = PyUnicode_DATA(mapping);
7290 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007291 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007292 unsigned char ch;
7293 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7294 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7295 if (outkind == PyUnicode_1BYTE_KIND) {
7296 void *outdata = PyUnicode_DATA(v);
7297 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7298 while (s < e) {
7299 unsigned char ch = *s;
7300 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7301 if (x > maxchar)
7302 goto Error;
7303 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7304 ++s;
7305 }
7306 break;
7307 }
7308 else if (outkind == PyUnicode_2BYTE_KIND) {
7309 void *outdata = PyUnicode_DATA(v);
7310 while (s < e) {
7311 unsigned char ch = *s;
7312 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7313 if (x == 0xFFFE)
7314 goto Error;
7315 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7316 ++s;
7317 }
7318 break;
7319 }
7320 }
7321 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007322
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007324 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007325 else
7326 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007327Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007328 if (x == 0xfffe)
7329 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 startinpos = s-starts;
7332 endinpos = startinpos+1;
7333 if (unicode_decode_call_errorhandler(
7334 errors, &errorHandler,
7335 "charmap", "character maps to <undefined>",
7336 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007337 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 goto onError;
7339 }
7340 continue;
7341 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007342
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007343 if (unicode_putchar(&v, &outpos, x) < 0)
7344 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007346 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007347 }
7348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 while (s < e) {
7350 unsigned char ch = *s;
7351 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007352
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7354 w = PyLong_FromLong((long)ch);
7355 if (w == NULL)
7356 goto onError;
7357 x = PyObject_GetItem(mapping, w);
7358 Py_DECREF(w);
7359 if (x == NULL) {
7360 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7361 /* No mapping found means: mapping is undefined. */
7362 PyErr_Clear();
7363 x = Py_None;
7364 Py_INCREF(x);
7365 } else
7366 goto onError;
7367 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007368
Benjamin Peterson29060642009-01-31 22:14:21 +00007369 /* Apply mapping */
7370 if (PyLong_Check(x)) {
7371 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007372 if (value < 0 || value > MAX_UNICODE) {
7373 PyErr_Format(PyExc_TypeError,
7374 "character mapping must be in range(0x%lx)",
7375 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 Py_DECREF(x);
7377 goto onError;
7378 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007379 if (unicode_putchar(&v, &outpos, value) < 0)
7380 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007381 }
7382 else if (x == Py_None) {
7383 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 startinpos = s-starts;
7385 endinpos = startinpos+1;
7386 if (unicode_decode_call_errorhandler(
7387 errors, &errorHandler,
7388 "charmap", "character maps to <undefined>",
7389 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007390 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 Py_DECREF(x);
7392 goto onError;
7393 }
7394 Py_DECREF(x);
7395 continue;
7396 }
7397 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007398 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007399
Benjamin Petersonbac79492012-01-14 13:34:47 -05007400 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007401 goto onError;
7402 targetsize = PyUnicode_GET_LENGTH(x);
7403
7404 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007406 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007407 PyUnicode_READ_CHAR(x, 0)) < 0)
7408 goto onError;
7409 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 else if (targetsize > 1) {
7411 /* 1-n mapping */
7412 if (targetsize > extrachars) {
7413 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 Py_ssize_t needed = (targetsize - extrachars) + \
7415 (targetsize << 2);
7416 extrachars += needed;
7417 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007418 if (unicode_resize(&v,
7419 PyUnicode_GET_LENGTH(v) + needed) < 0)
7420 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007421 Py_DECREF(x);
7422 goto onError;
7423 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007425 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007426 goto onError;
7427 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7428 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007429 extrachars -= targetsize;
7430 }
7431 /* 1-0 mapping: skip the character */
7432 }
7433 else {
7434 /* wrong return value */
7435 PyErr_SetString(PyExc_TypeError,
7436 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007437 Py_DECREF(x);
7438 goto onError;
7439 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007440 Py_DECREF(x);
7441 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007444 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007445 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007446 Py_XDECREF(errorHandler);
7447 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007448 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007449
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451 Py_XDECREF(errorHandler);
7452 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453 Py_XDECREF(v);
7454 return NULL;
7455}
7456
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007457/* Charmap encoding: the lookup table */
7458
Alexander Belopolsky40018472011-02-26 01:02:56 +00007459struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 PyObject_HEAD
7461 unsigned char level1[32];
7462 int count2, count3;
7463 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007464};
7465
7466static PyObject*
7467encoding_map_size(PyObject *obj, PyObject* args)
7468{
7469 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007470 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007472}
7473
7474static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007475 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 PyDoc_STR("Return the size (in bytes) of this object") },
7477 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007478};
7479
7480static void
7481encoding_map_dealloc(PyObject* o)
7482{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007483 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007484}
7485
7486static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007487 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007488 "EncodingMap", /*tp_name*/
7489 sizeof(struct encoding_map), /*tp_basicsize*/
7490 0, /*tp_itemsize*/
7491 /* methods */
7492 encoding_map_dealloc, /*tp_dealloc*/
7493 0, /*tp_print*/
7494 0, /*tp_getattr*/
7495 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007496 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 0, /*tp_repr*/
7498 0, /*tp_as_number*/
7499 0, /*tp_as_sequence*/
7500 0, /*tp_as_mapping*/
7501 0, /*tp_hash*/
7502 0, /*tp_call*/
7503 0, /*tp_str*/
7504 0, /*tp_getattro*/
7505 0, /*tp_setattro*/
7506 0, /*tp_as_buffer*/
7507 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7508 0, /*tp_doc*/
7509 0, /*tp_traverse*/
7510 0, /*tp_clear*/
7511 0, /*tp_richcompare*/
7512 0, /*tp_weaklistoffset*/
7513 0, /*tp_iter*/
7514 0, /*tp_iternext*/
7515 encoding_map_methods, /*tp_methods*/
7516 0, /*tp_members*/
7517 0, /*tp_getset*/
7518 0, /*tp_base*/
7519 0, /*tp_dict*/
7520 0, /*tp_descr_get*/
7521 0, /*tp_descr_set*/
7522 0, /*tp_dictoffset*/
7523 0, /*tp_init*/
7524 0, /*tp_alloc*/
7525 0, /*tp_new*/
7526 0, /*tp_free*/
7527 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007528};
7529
7530PyObject*
7531PyUnicode_BuildEncodingMap(PyObject* string)
7532{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007533 PyObject *result;
7534 struct encoding_map *mresult;
7535 int i;
7536 int need_dict = 0;
7537 unsigned char level1[32];
7538 unsigned char level2[512];
7539 unsigned char *mlevel1, *mlevel2, *mlevel3;
7540 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007541 int kind;
7542 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007543 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007544 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007545
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007546 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007547 PyErr_BadArgument();
7548 return NULL;
7549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007550 kind = PyUnicode_KIND(string);
7551 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007552 length = PyUnicode_GET_LENGTH(string);
7553 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007554 memset(level1, 0xFF, sizeof level1);
7555 memset(level2, 0xFF, sizeof level2);
7556
7557 /* If there isn't a one-to-one mapping of NULL to \0,
7558 or if there are non-BMP characters, we need to use
7559 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007560 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007561 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007562 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007563 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 ch = PyUnicode_READ(kind, data, i);
7565 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007566 need_dict = 1;
7567 break;
7568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007569 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 /* unmapped character */
7571 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007572 l1 = ch >> 11;
7573 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007574 if (level1[l1] == 0xFF)
7575 level1[l1] = count2++;
7576 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007577 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007578 }
7579
7580 if (count2 >= 0xFF || count3 >= 0xFF)
7581 need_dict = 1;
7582
7583 if (need_dict) {
7584 PyObject *result = PyDict_New();
7585 PyObject *key, *value;
7586 if (!result)
7587 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007588 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007589 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007590 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007591 if (!key || !value)
7592 goto failed1;
7593 if (PyDict_SetItem(result, key, value) == -1)
7594 goto failed1;
7595 Py_DECREF(key);
7596 Py_DECREF(value);
7597 }
7598 return result;
7599 failed1:
7600 Py_XDECREF(key);
7601 Py_XDECREF(value);
7602 Py_DECREF(result);
7603 return NULL;
7604 }
7605
7606 /* Create a three-level trie */
7607 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7608 16*count2 + 128*count3 - 1);
7609 if (!result)
7610 return PyErr_NoMemory();
7611 PyObject_Init(result, &EncodingMapType);
7612 mresult = (struct encoding_map*)result;
7613 mresult->count2 = count2;
7614 mresult->count3 = count3;
7615 mlevel1 = mresult->level1;
7616 mlevel2 = mresult->level23;
7617 mlevel3 = mresult->level23 + 16*count2;
7618 memcpy(mlevel1, level1, 32);
7619 memset(mlevel2, 0xFF, 16*count2);
7620 memset(mlevel3, 0, 128*count3);
7621 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007622 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007624 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7625 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007626 /* unmapped character */
7627 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007628 o1 = ch>>11;
7629 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007630 i2 = 16*mlevel1[o1] + o2;
7631 if (mlevel2[i2] == 0xFF)
7632 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007633 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007634 i3 = 128*mlevel2[i2] + o3;
7635 mlevel3[i3] = i;
7636 }
7637 return result;
7638}
7639
7640static int
Victor Stinner22168992011-11-20 17:09:18 +01007641encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007642{
7643 struct encoding_map *map = (struct encoding_map*)mapping;
7644 int l1 = c>>11;
7645 int l2 = (c>>7) & 0xF;
7646 int l3 = c & 0x7F;
7647 int i;
7648
Victor Stinner22168992011-11-20 17:09:18 +01007649 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007651 if (c == 0)
7652 return 0;
7653 /* level 1*/
7654 i = map->level1[l1];
7655 if (i == 0xFF) {
7656 return -1;
7657 }
7658 /* level 2*/
7659 i = map->level23[16*i+l2];
7660 if (i == 0xFF) {
7661 return -1;
7662 }
7663 /* level 3 */
7664 i = map->level23[16*map->count2 + 128*i + l3];
7665 if (i == 0) {
7666 return -1;
7667 }
7668 return i;
7669}
7670
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007671/* Lookup the character ch in the mapping. If the character
7672 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007673 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007674static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007675charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676{
Christian Heimes217cfd12007-12-02 14:31:20 +00007677 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007678 PyObject *x;
7679
7680 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007682 x = PyObject_GetItem(mapping, w);
7683 Py_DECREF(w);
7684 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7686 /* No mapping found means: mapping is undefined. */
7687 PyErr_Clear();
7688 x = Py_None;
7689 Py_INCREF(x);
7690 return x;
7691 } else
7692 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007693 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007694 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007695 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007696 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007697 long value = PyLong_AS_LONG(x);
7698 if (value < 0 || value > 255) {
7699 PyErr_SetString(PyExc_TypeError,
7700 "character mapping must be in range(256)");
7701 Py_DECREF(x);
7702 return NULL;
7703 }
7704 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007705 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007706 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 /* wrong return value */
7710 PyErr_Format(PyExc_TypeError,
7711 "character mapping must return integer, bytes or None, not %.400s",
7712 x->ob_type->tp_name);
7713 Py_DECREF(x);
7714 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 }
7716}
7717
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007718static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007719charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007720{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007721 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7722 /* exponentially overallocate to minimize reallocations */
7723 if (requiredsize < 2*outsize)
7724 requiredsize = 2*outsize;
7725 if (_PyBytes_Resize(outobj, requiredsize))
7726 return -1;
7727 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007728}
7729
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007733/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007734 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007735 space is available. Return a new reference to the object that
7736 was put in the output buffer, or Py_None, if the mapping was undefined
7737 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007738 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007739static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007740charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007741 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007742{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007743 PyObject *rep;
7744 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007745 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746
Christian Heimes90aa7642007-12-19 02:45:37 +00007747 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007748 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007750 if (res == -1)
7751 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 if (outsize<requiredsize)
7753 if (charmapencode_resize(outobj, outpos, requiredsize))
7754 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007755 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 outstart[(*outpos)++] = (char)res;
7757 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758 }
7759
7760 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007761 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007762 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007763 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 Py_DECREF(rep);
7765 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007767 if (PyLong_Check(rep)) {
7768 Py_ssize_t requiredsize = *outpos+1;
7769 if (outsize<requiredsize)
7770 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7771 Py_DECREF(rep);
7772 return enc_EXCEPTION;
7773 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007774 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007775 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007776 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 else {
7778 const char *repchars = PyBytes_AS_STRING(rep);
7779 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7780 Py_ssize_t requiredsize = *outpos+repsize;
7781 if (outsize<requiredsize)
7782 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7783 Py_DECREF(rep);
7784 return enc_EXCEPTION;
7785 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007786 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 memcpy(outstart + *outpos, repchars, repsize);
7788 *outpos += repsize;
7789 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007790 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007791 Py_DECREF(rep);
7792 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007793}
7794
7795/* handle an error in PyUnicode_EncodeCharmap
7796 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007797static int
7798charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007799 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007800 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007801 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007802 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007803{
7804 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007805 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007806 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007807 enum PyUnicode_Kind kind;
7808 void *data;
7809 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007811 Py_ssize_t collstartpos = *inpos;
7812 Py_ssize_t collendpos = *inpos+1;
7813 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814 char *encoding = "charmap";
7815 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007817 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007818 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007819
Benjamin Petersonbac79492012-01-14 13:34:47 -05007820 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007821 return -1;
7822 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 /* find all unencodable characters */
7824 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007826 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007827 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007828 val = encoding_map_lookup(ch, mapping);
7829 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 break;
7831 ++collendpos;
7832 continue;
7833 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007835 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7836 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007837 if (rep==NULL)
7838 return -1;
7839 else if (rep!=Py_None) {
7840 Py_DECREF(rep);
7841 break;
7842 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007843 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007844 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007845 }
7846 /* cache callback name lookup
7847 * (if not done yet, i.e. it's the first error) */
7848 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 if ((errors==NULL) || (!strcmp(errors, "strict")))
7850 *known_errorHandler = 1;
7851 else if (!strcmp(errors, "replace"))
7852 *known_errorHandler = 2;
7853 else if (!strcmp(errors, "ignore"))
7854 *known_errorHandler = 3;
7855 else if (!strcmp(errors, "xmlcharrefreplace"))
7856 *known_errorHandler = 4;
7857 else
7858 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007859 }
7860 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007862 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007863 return -1;
7864 case 2: /* replace */
7865 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007866 x = charmapencode_output('?', mapping, res, respos);
7867 if (x==enc_EXCEPTION) {
7868 return -1;
7869 }
7870 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007871 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007872 return -1;
7873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007874 }
7875 /* fall through */
7876 case 3: /* ignore */
7877 *inpos = collendpos;
7878 break;
7879 case 4: /* xmlcharrefreplace */
7880 /* generate replacement (temporarily (mis)uses p) */
7881 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007882 char buffer[2+29+1+1];
7883 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007884 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007885 for (cp = buffer; *cp; ++cp) {
7886 x = charmapencode_output(*cp, mapping, res, respos);
7887 if (x==enc_EXCEPTION)
7888 return -1;
7889 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007890 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 return -1;
7892 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007893 }
7894 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007895 *inpos = collendpos;
7896 break;
7897 default:
7898 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007899 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007901 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007902 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007903 if (PyBytes_Check(repunicode)) {
7904 /* Directly copy bytes result to output. */
7905 Py_ssize_t outsize = PyBytes_Size(*res);
7906 Py_ssize_t requiredsize;
7907 repsize = PyBytes_Size(repunicode);
7908 requiredsize = *respos + repsize;
7909 if (requiredsize > outsize)
7910 /* Make room for all additional bytes. */
7911 if (charmapencode_resize(res, respos, requiredsize)) {
7912 Py_DECREF(repunicode);
7913 return -1;
7914 }
7915 memcpy(PyBytes_AsString(*res) + *respos,
7916 PyBytes_AsString(repunicode), repsize);
7917 *respos += repsize;
7918 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007919 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007920 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007921 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007923 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007924 Py_DECREF(repunicode);
7925 return -1;
7926 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007927 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007928 data = PyUnicode_DATA(repunicode);
7929 kind = PyUnicode_KIND(repunicode);
7930 for (index = 0; index < repsize; index++) {
7931 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7932 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007933 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007934 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 return -1;
7936 }
7937 else if (x==enc_FAILED) {
7938 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007939 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 return -1;
7941 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007942 }
7943 *inpos = newpos;
7944 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945 }
7946 return 0;
7947}
7948
Alexander Belopolsky40018472011-02-26 01:02:56 +00007949PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007950_PyUnicode_EncodeCharmap(PyObject *unicode,
7951 PyObject *mapping,
7952 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007953{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 /* output object */
7955 PyObject *res = NULL;
7956 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007957 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007958 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007960 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961 PyObject *errorHandler = NULL;
7962 PyObject *exc = NULL;
7963 /* the following variable is used for caching string comparisons
7964 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7965 * 3=ignore, 4=xmlcharrefreplace */
7966 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967
Benjamin Petersonbac79492012-01-14 13:34:47 -05007968 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007969 return NULL;
7970 size = PyUnicode_GET_LENGTH(unicode);
7971
Guido van Rossumd57fd912000-03-10 22:53:23 +00007972 /* Default to Latin-1 */
7973 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007974 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007975
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 /* allocate enough for a simple encoding without
7977 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007978 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 if (res == NULL)
7980 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007981 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007985 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007987 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 if (x==enc_EXCEPTION) /* error */
7989 goto onError;
7990 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007991 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 &exc,
7993 &known_errorHandler, &errorHandler, errors,
7994 &res, &respos)) {
7995 goto onError;
7996 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007997 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007998 else
7999 /* done with this character => adjust input position */
8000 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008004 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008005 if (_PyBytes_Resize(&res, respos) < 0)
8006 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008007
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 Py_XDECREF(exc);
8009 Py_XDECREF(errorHandler);
8010 return res;
8011
Benjamin Peterson29060642009-01-31 22:14:21 +00008012 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013 Py_XDECREF(res);
8014 Py_XDECREF(exc);
8015 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008016 return NULL;
8017}
8018
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008019/* Deprecated */
8020PyObject *
8021PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8022 Py_ssize_t size,
8023 PyObject *mapping,
8024 const char *errors)
8025{
8026 PyObject *result;
8027 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8028 if (unicode == NULL)
8029 return NULL;
8030 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8031 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008032 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008033}
8034
Alexander Belopolsky40018472011-02-26 01:02:56 +00008035PyObject *
8036PyUnicode_AsCharmapString(PyObject *unicode,
8037 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
8039 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 PyErr_BadArgument();
8041 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008043 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008044}
8045
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008046/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008047static void
8048make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008049 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008050 Py_ssize_t startpos, Py_ssize_t endpos,
8051 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008054 *exceptionObject = _PyUnicodeTranslateError_Create(
8055 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056 }
8057 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8059 goto onError;
8060 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8061 goto onError;
8062 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8063 goto onError;
8064 return;
8065 onError:
8066 Py_DECREF(*exceptionObject);
8067 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 }
8069}
8070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008071/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008072static void
8073raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008074 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008075 Py_ssize_t startpos, Py_ssize_t endpos,
8076 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008077{
8078 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082}
8083
8084/* error handling callback helper:
8085 build arguments, call the callback and check the arguments,
8086 put the result into newpos and return the replacement string, which
8087 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008088static PyObject *
8089unicode_translate_call_errorhandler(const char *errors,
8090 PyObject **errorHandler,
8091 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008092 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008093 Py_ssize_t startpos, Py_ssize_t endpos,
8094 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008096 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008098 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 PyObject *restuple;
8100 PyObject *resunicode;
8101
8102 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106 }
8107
8108 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112
8113 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008116 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008117 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008118 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 Py_DECREF(restuple);
8120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 }
8122 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 &resunicode, &i_newpos)) {
8124 Py_DECREF(restuple);
8125 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008127 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008128 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008129 else
8130 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008131 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8133 Py_DECREF(restuple);
8134 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008135 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008136 Py_INCREF(resunicode);
8137 Py_DECREF(restuple);
8138 return resunicode;
8139}
8140
8141/* Lookup the character ch in the mapping and put the result in result,
8142 which must be decrefed by the caller.
8143 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008144static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008145charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146{
Christian Heimes217cfd12007-12-02 14:31:20 +00008147 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 PyObject *x;
8149
8150 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 x = PyObject_GetItem(mapping, w);
8153 Py_DECREF(w);
8154 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8156 /* No mapping found means: use 1:1 mapping. */
8157 PyErr_Clear();
8158 *result = NULL;
8159 return 0;
8160 } else
8161 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 }
8163 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008164 *result = x;
8165 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008166 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008167 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 long value = PyLong_AS_LONG(x);
8169 long max = PyUnicode_GetMax();
8170 if (value < 0 || value > max) {
8171 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008172 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 Py_DECREF(x);
8174 return -1;
8175 }
8176 *result = x;
8177 return 0;
8178 }
8179 else if (PyUnicode_Check(x)) {
8180 *result = x;
8181 return 0;
8182 }
8183 else {
8184 /* wrong return value */
8185 PyErr_SetString(PyExc_TypeError,
8186 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008187 Py_DECREF(x);
8188 return -1;
8189 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008190}
8191/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 if not reallocate and adjust various state variables.
8193 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008194static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008197{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008199 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008200 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 /* exponentially overallocate to minimize reallocations */
8202 if (requiredsize < 2 * oldsize)
8203 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008204 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8205 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008207 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008208 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 }
8210 return 0;
8211}
8212/* lookup the character, put the result in the output string and adjust
8213 various state variables. Return a new reference to the object that
8214 was put in the output buffer in *result, or Py_None, if the mapping was
8215 undefined (in which case no character was written).
8216 The called must decref result.
8217 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8220 PyObject *mapping, Py_UCS4 **output,
8221 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008222 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8225 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230 }
8231 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008233 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 }
8237 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 Py_ssize_t repsize;
8239 if (PyUnicode_READY(*res) == -1)
8240 return -1;
8241 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 if (repsize==1) {
8243 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 }
8246 else if (repsize!=0) {
8247 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 Py_ssize_t requiredsize = *opos +
8249 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 Py_ssize_t i;
8252 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 for(i = 0; i < repsize; i++)
8255 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 }
8258 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 return 0;
8261}
8262
Alexander Belopolsky40018472011-02-26 01:02:56 +00008263PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008264_PyUnicode_TranslateCharmap(PyObject *input,
8265 PyObject *mapping,
8266 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268 /* input object */
8269 char *idata;
8270 Py_ssize_t size, i;
8271 int kind;
8272 /* output buffer */
8273 Py_UCS4 *output = NULL;
8274 Py_ssize_t osize;
8275 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008277 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 char *reason = "character maps to <undefined>";
8279 PyObject *errorHandler = NULL;
8280 PyObject *exc = NULL;
8281 /* the following variable is used for caching string comparisons
8282 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8283 * 3=ignore, 4=xmlcharrefreplace */
8284 int known_errorHandler = -1;
8285
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008287 PyErr_BadArgument();
8288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008289 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291 if (PyUnicode_READY(input) == -1)
8292 return NULL;
8293 idata = (char*)PyUnicode_DATA(input);
8294 kind = PyUnicode_KIND(input);
8295 size = PyUnicode_GET_LENGTH(input);
8296 i = 0;
8297
8298 if (size == 0) {
8299 Py_INCREF(input);
8300 return input;
8301 }
8302
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 /* allocate enough for a simple 1:1 translation without
8304 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008305 osize = size;
8306 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8307 opos = 0;
8308 if (output == NULL) {
8309 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 /* try to encode it */
8315 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008316 if (charmaptranslate_output(input, i, mapping,
8317 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 Py_XDECREF(x);
8319 goto onError;
8320 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008321 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 else { /* untranslatable character */
8325 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8326 Py_ssize_t repsize;
8327 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008328 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008330 Py_ssize_t collstart = i;
8331 Py_ssize_t collend = i+1;
8332 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008335 while (collend < size) {
8336 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 goto onError;
8338 Py_XDECREF(x);
8339 if (x!=Py_None)
8340 break;
8341 ++collend;
8342 }
8343 /* cache callback name lookup
8344 * (if not done yet, i.e. it's the first error) */
8345 if (known_errorHandler==-1) {
8346 if ((errors==NULL) || (!strcmp(errors, "strict")))
8347 known_errorHandler = 1;
8348 else if (!strcmp(errors, "replace"))
8349 known_errorHandler = 2;
8350 else if (!strcmp(errors, "ignore"))
8351 known_errorHandler = 3;
8352 else if (!strcmp(errors, "xmlcharrefreplace"))
8353 known_errorHandler = 4;
8354 else
8355 known_errorHandler = 0;
8356 }
8357 switch (known_errorHandler) {
8358 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008359 raise_translate_exception(&exc, input, collstart,
8360 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008361 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 case 2: /* replace */
8363 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364 for (coll = collstart; coll<collend; coll++)
8365 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 /* fall through */
8367 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 break;
8370 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 /* generate replacement (temporarily (mis)uses i) */
8372 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 char buffer[2+29+1+1];
8374 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8376 if (charmaptranslate_makespace(&output, &osize,
8377 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 goto onError;
8379 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008382 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008383 break;
8384 default:
8385 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 reason, input, &exc,
8387 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008388 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008390 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008391 Py_DECREF(repunicode);
8392 goto onError;
8393 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008394 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 repsize = PyUnicode_GET_LENGTH(repunicode);
8396 if (charmaptranslate_makespace(&output, &osize,
8397 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 Py_DECREF(repunicode);
8399 goto onError;
8400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 for (uni2 = 0; repsize-->0; ++uni2)
8402 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8403 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008405 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008406 }
8407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008408 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8409 if (!res)
8410 goto onError;
8411 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 Py_XDECREF(exc);
8413 Py_XDECREF(errorHandler);
8414 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008415
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 Py_XDECREF(exc);
8419 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 return NULL;
8421}
8422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423/* Deprecated. Use PyUnicode_Translate instead. */
8424PyObject *
8425PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8426 Py_ssize_t size,
8427 PyObject *mapping,
8428 const char *errors)
8429{
Christian Heimes5f520f42012-09-11 14:03:25 +02008430 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008431 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8432 if (!unicode)
8433 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008434 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8435 Py_DECREF(unicode);
8436 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437}
8438
Alexander Belopolsky40018472011-02-26 01:02:56 +00008439PyObject *
8440PyUnicode_Translate(PyObject *str,
8441 PyObject *mapping,
8442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443{
8444 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008445
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 str = PyUnicode_FromObject(str);
8447 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008448 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008449 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 Py_DECREF(str);
8451 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008452}
Tim Petersced69f82003-09-16 20:30:58 +00008453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008455fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456{
8457 /* No need to call PyUnicode_READY(self) because this function is only
8458 called as a callback from fixup() which does it already. */
8459 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8460 const int kind = PyUnicode_KIND(self);
8461 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008462 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008463 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464 Py_ssize_t i;
8465
8466 for (i = 0; i < len; ++i) {
8467 ch = PyUnicode_READ(kind, data, i);
8468 fixed = 0;
8469 if (ch > 127) {
8470 if (Py_UNICODE_ISSPACE(ch))
8471 fixed = ' ';
8472 else {
8473 const int decimal = Py_UNICODE_TODECIMAL(ch);
8474 if (decimal >= 0)
8475 fixed = '0' + decimal;
8476 }
8477 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008478 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008479 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 PyUnicode_WRITE(kind, data, i, fixed);
8481 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008482 else
8483 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008485 }
8486
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008487 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488}
8489
8490PyObject *
8491_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8492{
8493 if (!PyUnicode_Check(unicode)) {
8494 PyErr_BadInternalCall();
8495 return NULL;
8496 }
8497 if (PyUnicode_READY(unicode) == -1)
8498 return NULL;
8499 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8500 /* If the string is already ASCII, just return the same string */
8501 Py_INCREF(unicode);
8502 return unicode;
8503 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008504 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505}
8506
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008507PyObject *
8508PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8509 Py_ssize_t length)
8510{
Victor Stinnerf0124502011-11-21 23:12:56 +01008511 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008512 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008513 Py_UCS4 maxchar;
8514 enum PyUnicode_Kind kind;
8515 void *data;
8516
Victor Stinner99d7ad02012-02-22 13:37:39 +01008517 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008518 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008519 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008520 if (ch > 127) {
8521 int decimal = Py_UNICODE_TODECIMAL(ch);
8522 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008523 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008524 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008525 }
8526 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008527
8528 /* Copy to a new string */
8529 decimal = PyUnicode_New(length, maxchar);
8530 if (decimal == NULL)
8531 return decimal;
8532 kind = PyUnicode_KIND(decimal);
8533 data = PyUnicode_DATA(decimal);
8534 /* Iterate over code points */
8535 for (i = 0; i < length; i++) {
8536 Py_UNICODE ch = s[i];
8537 if (ch > 127) {
8538 int decimal = Py_UNICODE_TODECIMAL(ch);
8539 if (decimal >= 0)
8540 ch = '0' + decimal;
8541 }
8542 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008543 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008544 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008545}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008546/* --- Decimal Encoder ---------------------------------------------------- */
8547
Alexander Belopolsky40018472011-02-26 01:02:56 +00008548int
8549PyUnicode_EncodeDecimal(Py_UNICODE *s,
8550 Py_ssize_t length,
8551 char *output,
8552 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008553{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008554 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008555 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008556 enum PyUnicode_Kind kind;
8557 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008558
8559 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008560 PyErr_BadArgument();
8561 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008562 }
8563
Victor Stinner42bf7752011-11-21 22:52:58 +01008564 unicode = PyUnicode_FromUnicode(s, length);
8565 if (unicode == NULL)
8566 return -1;
8567
Benjamin Petersonbac79492012-01-14 13:34:47 -05008568 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008569 Py_DECREF(unicode);
8570 return -1;
8571 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008572 kind = PyUnicode_KIND(unicode);
8573 data = PyUnicode_DATA(unicode);
8574
Victor Stinnerb84d7232011-11-22 01:50:07 +01008575 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008576 PyObject *exc;
8577 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008579 Py_ssize_t startpos;
8580
8581 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008582
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008584 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008585 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008588 decimal = Py_UNICODE_TODECIMAL(ch);
8589 if (decimal >= 0) {
8590 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008591 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 continue;
8593 }
8594 if (0 < ch && ch < 256) {
8595 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008596 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 continue;
8598 }
Victor Stinner6345be92011-11-25 20:09:01 +01008599
Victor Stinner42bf7752011-11-21 22:52:58 +01008600 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008601 exc = NULL;
8602 raise_encode_exception(&exc, "decimal", unicode,
8603 startpos, startpos+1,
8604 "invalid decimal Unicode string");
8605 Py_XDECREF(exc);
8606 Py_DECREF(unicode);
8607 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008608 }
8609 /* 0-terminate the output string */
8610 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008611 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008612 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008613}
8614
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615/* --- Helpers ------------------------------------------------------------ */
8616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008618any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 Py_ssize_t start,
8620 Py_ssize_t end)
8621{
8622 int kind1, kind2, kind;
8623 void *buf1, *buf2;
8624 Py_ssize_t len1, len2, result;
8625
8626 kind1 = PyUnicode_KIND(s1);
8627 kind2 = PyUnicode_KIND(s2);
8628 kind = kind1 > kind2 ? kind1 : kind2;
8629 buf1 = PyUnicode_DATA(s1);
8630 buf2 = PyUnicode_DATA(s2);
8631 if (kind1 != kind)
8632 buf1 = _PyUnicode_AsKind(s1, kind);
8633 if (!buf1)
8634 return -2;
8635 if (kind2 != kind)
8636 buf2 = _PyUnicode_AsKind(s2, kind);
8637 if (!buf2) {
8638 if (kind1 != kind) PyMem_Free(buf1);
8639 return -2;
8640 }
8641 len1 = PyUnicode_GET_LENGTH(s1);
8642 len2 = PyUnicode_GET_LENGTH(s2);
8643
Victor Stinner794d5672011-10-10 03:21:36 +02008644 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008645 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008646 case PyUnicode_1BYTE_KIND:
8647 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8648 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8649 else
8650 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8651 break;
8652 case PyUnicode_2BYTE_KIND:
8653 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8654 break;
8655 case PyUnicode_4BYTE_KIND:
8656 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8657 break;
8658 default:
8659 assert(0); result = -2;
8660 }
8661 }
8662 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008663 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008664 case PyUnicode_1BYTE_KIND:
8665 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8666 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8667 else
8668 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8669 break;
8670 case PyUnicode_2BYTE_KIND:
8671 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8672 break;
8673 case PyUnicode_4BYTE_KIND:
8674 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8675 break;
8676 default:
8677 assert(0); result = -2;
8678 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 }
8680
8681 if (kind1 != kind)
8682 PyMem_Free(buf1);
8683 if (kind2 != kind)
8684 PyMem_Free(buf2);
8685
8686 return result;
8687}
8688
8689Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008690_PyUnicode_InsertThousandsGrouping(
8691 PyObject *unicode, Py_ssize_t index,
8692 Py_ssize_t n_buffer,
8693 void *digits, Py_ssize_t n_digits,
8694 Py_ssize_t min_width,
8695 const char *grouping, PyObject *thousands_sep,
8696 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697{
Victor Stinner41a863c2012-02-24 00:37:51 +01008698 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008699 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008700 Py_ssize_t thousands_sep_len;
8701 Py_ssize_t len;
8702
8703 if (unicode != NULL) {
8704 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008705 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008706 }
8707 else {
8708 kind = PyUnicode_1BYTE_KIND;
8709 data = NULL;
8710 }
8711 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8712 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8713 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8714 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008715 if (thousands_sep_kind < kind) {
8716 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8717 if (!thousands_sep_data)
8718 return -1;
8719 }
8720 else {
8721 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8722 if (!data)
8723 return -1;
8724 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008725 }
8726
Benjamin Petersonead6b532011-12-20 17:23:42 -06008727 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008729 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008730 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008731 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008732 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008733 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008734 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008735 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008736 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008737 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008738 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008739 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008741 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008742 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008743 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008744 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008745 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008748 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008749 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008750 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008751 break;
8752 default:
8753 assert(0);
8754 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008756 if (unicode != NULL && thousands_sep_kind != kind) {
8757 if (thousands_sep_kind < kind)
8758 PyMem_Free(thousands_sep_data);
8759 else
8760 PyMem_Free(data);
8761 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008762 if (unicode == NULL) {
8763 *maxchar = 127;
8764 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008765 *maxchar = MAX_MAXCHAR(*maxchar,
8766 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008767 }
8768 }
8769 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770}
8771
8772
/* Helper macro to fix up start/end slice values in place for a sequence of
   length `len`:
     - `end` greater than `len` is clamped to `len`; a negative `end` has
       `len` added and is then clamped to 0 if still negative;
     - a negative `start` has `len` added and is then clamped to 0 if still
       negative.  `start` is never clamped against `len`.
   NOTE(review): the expansion is two bare statements (no do { } while (0)
   wrapper), the arguments are not parenthesized and are evaluated several
   times.  Use it only as a complete statement, with plain variables for
   `start` and `end` and a side-effect-free `len`. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008787
Alexander Belopolsky40018472011-02-26 01:02:56 +00008788Py_ssize_t
8789PyUnicode_Count(PyObject *str,
8790 PyObject *substr,
8791 Py_ssize_t start,
8792 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008793{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008794 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008795 PyObject* str_obj;
8796 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797 int kind1, kind2, kind;
8798 void *buf1 = NULL, *buf2 = NULL;
8799 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008800
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008801 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008802 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008804 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008805 if (!sub_obj) {
8806 Py_DECREF(str_obj);
8807 return -1;
8808 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008809 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008810 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 Py_DECREF(str_obj);
8812 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008813 }
Tim Petersced69f82003-09-16 20:30:58 +00008814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008815 kind1 = PyUnicode_KIND(str_obj);
8816 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008817 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008820 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008821 if (kind2 > kind) {
8822 Py_DECREF(sub_obj);
8823 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008824 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008825 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008826 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 if (!buf2)
8829 goto onError;
8830 len1 = PyUnicode_GET_LENGTH(str_obj);
8831 len2 = PyUnicode_GET_LENGTH(sub_obj);
8832
8833 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008834 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008835 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008836 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8837 result = asciilib_count(
8838 ((Py_UCS1*)buf1) + start, end - start,
8839 buf2, len2, PY_SSIZE_T_MAX
8840 );
8841 else
8842 result = ucs1lib_count(
8843 ((Py_UCS1*)buf1) + start, end - start,
8844 buf2, len2, PY_SSIZE_T_MAX
8845 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 break;
8847 case PyUnicode_2BYTE_KIND:
8848 result = ucs2lib_count(
8849 ((Py_UCS2*)buf1) + start, end - start,
8850 buf2, len2, PY_SSIZE_T_MAX
8851 );
8852 break;
8853 case PyUnicode_4BYTE_KIND:
8854 result = ucs4lib_count(
8855 ((Py_UCS4*)buf1) + start, end - start,
8856 buf2, len2, PY_SSIZE_T_MAX
8857 );
8858 break;
8859 default:
8860 assert(0); result = 0;
8861 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008862
8863 Py_DECREF(sub_obj);
8864 Py_DECREF(str_obj);
8865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 if (kind2 != kind)
8867 PyMem_Free(buf2);
8868
Guido van Rossumd57fd912000-03-10 22:53:23 +00008869 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 onError:
8871 Py_DECREF(sub_obj);
8872 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 if (kind2 != kind && buf2)
8874 PyMem_Free(buf2);
8875 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876}
8877
Alexander Belopolsky40018472011-02-26 01:02:56 +00008878Py_ssize_t
8879PyUnicode_Find(PyObject *str,
8880 PyObject *sub,
8881 Py_ssize_t start,
8882 Py_ssize_t end,
8883 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008885 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008886
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008888 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008890 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008891 if (!sub) {
8892 Py_DECREF(str);
8893 return -2;
8894 }
8895 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8896 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 Py_DECREF(str);
8898 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899 }
Tim Petersced69f82003-09-16 20:30:58 +00008900
Victor Stinner794d5672011-10-10 03:21:36 +02008901 result = any_find_slice(direction,
8902 str, sub, start, end
8903 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008904
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008906 Py_DECREF(sub);
8907
Guido van Rossumd57fd912000-03-10 22:53:23 +00008908 return result;
8909}
8910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911Py_ssize_t
8912PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8913 Py_ssize_t start, Py_ssize_t end,
8914 int direction)
8915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008917 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008918 if (PyUnicode_READY(str) == -1)
8919 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008920 if (start < 0 || end < 0) {
8921 PyErr_SetString(PyExc_IndexError, "string index out of range");
8922 return -2;
8923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 if (end > PyUnicode_GET_LENGTH(str))
8925 end = PyUnicode_GET_LENGTH(str);
8926 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008927 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8928 kind, end-start, ch, direction);
8929 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008930 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008931 else
8932 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008933}
8934
Alexander Belopolsky40018472011-02-26 01:02:56 +00008935static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008936tailmatch(PyObject *self,
8937 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008938 Py_ssize_t start,
8939 Py_ssize_t end,
8940 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 int kind_self;
8943 int kind_sub;
8944 void *data_self;
8945 void *data_sub;
8946 Py_ssize_t offset;
8947 Py_ssize_t i;
8948 Py_ssize_t end_sub;
8949
8950 if (PyUnicode_READY(self) == -1 ||
8951 PyUnicode_READY(substring) == -1)
8952 return 0;
8953
8954 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008955 return 1;
8956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8958 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 kind_self = PyUnicode_KIND(self);
8963 data_self = PyUnicode_DATA(self);
8964 kind_sub = PyUnicode_KIND(substring);
8965 data_sub = PyUnicode_DATA(substring);
8966 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8967
8968 if (direction > 0)
8969 offset = end;
8970 else
8971 offset = start;
8972
8973 if (PyUnicode_READ(kind_self, data_self, offset) ==
8974 PyUnicode_READ(kind_sub, data_sub, 0) &&
8975 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8976 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8977 /* If both are of the same kind, memcmp is sufficient */
8978 if (kind_self == kind_sub) {
8979 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008980 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008981 data_sub,
8982 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008983 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 }
8985 /* otherwise we have to compare each character by first accesing it */
8986 else {
8987 /* We do not need to compare 0 and len(substring)-1 because
8988 the if statement above ensured already that they are equal
8989 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008990 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 for (i = 1; i < end_sub; ++i) {
8992 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8993 PyUnicode_READ(kind_sub, data_sub, i))
8994 return 0;
8995 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008996 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008998 }
8999
9000 return 0;
9001}
9002
Alexander Belopolsky40018472011-02-26 01:02:56 +00009003Py_ssize_t
9004PyUnicode_Tailmatch(PyObject *str,
9005 PyObject *substr,
9006 Py_ssize_t start,
9007 Py_ssize_t end,
9008 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009009{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009010 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009011
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 str = PyUnicode_FromObject(str);
9013 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 substr = PyUnicode_FromObject(substr);
9016 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009017 Py_DECREF(str);
9018 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
Tim Petersced69f82003-09-16 20:30:58 +00009020
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009021 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 Py_DECREF(str);
9024 Py_DECREF(substr);
9025 return result;
9026}
9027
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028/* Apply fixfct filter to the Unicode object self and return a
9029 reference to the modified object */
9030
Alexander Belopolsky40018472011-02-26 01:02:56 +00009031static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009032fixup(PyObject *self,
9033 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 PyObject *u;
9036 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009037 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009039 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009040 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009042 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 /* fix functions return the new maximum character in a string,
9045 if the kind of the resulting unicode object does not change,
9046 everything is fine. Otherwise we need to change the string kind
9047 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009048 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009049
9050 if (maxchar_new == 0) {
9051 /* no changes */;
9052 if (PyUnicode_CheckExact(self)) {
9053 Py_DECREF(u);
9054 Py_INCREF(self);
9055 return self;
9056 }
9057 else
9058 return u;
9059 }
9060
Victor Stinnere6abb482012-05-02 01:15:40 +02009061 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062
Victor Stinnereaab6042011-12-11 22:22:39 +01009063 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009065
9066 /* In case the maximum character changed, we need to
9067 convert the string to the new category. */
9068 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9069 if (v == NULL) {
9070 Py_DECREF(u);
9071 return NULL;
9072 }
9073 if (maxchar_new > maxchar_old) {
9074 /* If the maxchar increased so that the kind changed, not all
9075 characters are representable anymore and we need to fix the
9076 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009077 _PyUnicode_FastCopyCharacters(v, 0,
9078 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009079 maxchar_old = fixfct(v);
9080 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 }
9082 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009083 _PyUnicode_FastCopyCharacters(v, 0,
9084 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009086 Py_DECREF(u);
9087 assert(_PyUnicode_CheckConsistency(v, 1));
9088 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089}
9090
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009091static PyObject *
9092ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009094 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9095 char *resdata, *data = PyUnicode_DATA(self);
9096 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009097
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009098 res = PyUnicode_New(len, 127);
9099 if (res == NULL)
9100 return NULL;
9101 resdata = PyUnicode_DATA(res);
9102 if (lower)
9103 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009105 _Py_bytes_upper(resdata, data, len);
9106 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107}
9108
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009109static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009110handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009112 Py_ssize_t j;
9113 int final_sigma;
9114 Py_UCS4 c;
9115 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009116
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009117 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9118
9119 where ! is a negation and \p{xxx} is a character with property xxx.
9120 */
9121 for (j = i - 1; j >= 0; j--) {
9122 c = PyUnicode_READ(kind, data, j);
9123 if (!_PyUnicode_IsCaseIgnorable(c))
9124 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009126 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9127 if (final_sigma) {
9128 for (j = i + 1; j < length; j++) {
9129 c = PyUnicode_READ(kind, data, j);
9130 if (!_PyUnicode_IsCaseIgnorable(c))
9131 break;
9132 }
9133 final_sigma = j == length || !_PyUnicode_IsCased(c);
9134 }
9135 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136}
9137
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009138static int
9139lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9140 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009142 /* Obscure special case. */
9143 if (c == 0x3A3) {
9144 mapped[0] = handle_capital_sigma(kind, data, length, i);
9145 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009147 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148}
9149
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009150static Py_ssize_t
9151do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153 Py_ssize_t i, k = 0;
9154 int n_res, j;
9155 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009156
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009157 c = PyUnicode_READ(kind, data, 0);
9158 n_res = _PyUnicode_ToUpperFull(c, mapped);
9159 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009160 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009161 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009163 for (i = 1; i < length; i++) {
9164 c = PyUnicode_READ(kind, data, i);
9165 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9166 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009167 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009168 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009169 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009170 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009171 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009172}
9173
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009174static Py_ssize_t
9175do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9176 Py_ssize_t i, k = 0;
9177
9178 for (i = 0; i < length; i++) {
9179 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9180 int n_res, j;
9181 if (Py_UNICODE_ISUPPER(c)) {
9182 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9183 }
9184 else if (Py_UNICODE_ISLOWER(c)) {
9185 n_res = _PyUnicode_ToUpperFull(c, mapped);
9186 }
9187 else {
9188 n_res = 1;
9189 mapped[0] = c;
9190 }
9191 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009192 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009193 res[k++] = mapped[j];
9194 }
9195 }
9196 return k;
9197}
9198
9199static Py_ssize_t
9200do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9201 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009203 Py_ssize_t i, k = 0;
9204
9205 for (i = 0; i < length; i++) {
9206 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9207 int n_res, j;
9208 if (lower)
9209 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9210 else
9211 n_res = _PyUnicode_ToUpperFull(c, mapped);
9212 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009213 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009214 res[k++] = mapped[j];
9215 }
9216 }
9217 return k;
9218}
9219
9220static Py_ssize_t
9221do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9222{
9223 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9224}
9225
9226static Py_ssize_t
9227do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9228{
9229 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9230}
9231
Benjamin Petersone51757f2012-01-12 21:10:29 -05009232static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009233do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9234{
9235 Py_ssize_t i, k = 0;
9236
9237 for (i = 0; i < length; i++) {
9238 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9239 Py_UCS4 mapped[3];
9240 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9241 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009242 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009243 res[k++] = mapped[j];
9244 }
9245 }
9246 return k;
9247}
9248
9249static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009250do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9251{
9252 Py_ssize_t i, k = 0;
9253 int previous_is_cased;
9254
9255 previous_is_cased = 0;
9256 for (i = 0; i < length; i++) {
9257 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9258 Py_UCS4 mapped[3];
9259 int n_res, j;
9260
9261 if (previous_is_cased)
9262 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9263 else
9264 n_res = _PyUnicode_ToTitleFull(c, mapped);
9265
9266 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009267 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009268 res[k++] = mapped[j];
9269 }
9270
9271 previous_is_cased = _PyUnicode_IsCased(c);
9272 }
9273 return k;
9274}
9275
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009276static PyObject *
9277case_operation(PyObject *self,
9278 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9279{
9280 PyObject *res = NULL;
9281 Py_ssize_t length, newlength = 0;
9282 int kind, outkind;
9283 void *data, *outdata;
9284 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9285
Benjamin Petersoneea48462012-01-16 14:28:50 -05009286 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009287
9288 kind = PyUnicode_KIND(self);
9289 data = PyUnicode_DATA(self);
9290 length = PyUnicode_GET_LENGTH(self);
9291 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9292 if (tmp == NULL)
9293 return PyErr_NoMemory();
9294 newlength = perform(kind, data, length, tmp, &maxchar);
9295 res = PyUnicode_New(newlength, maxchar);
9296 if (res == NULL)
9297 goto leave;
9298 tmpend = tmp + newlength;
9299 outdata = PyUnicode_DATA(res);
9300 outkind = PyUnicode_KIND(res);
9301 switch (outkind) {
9302 case PyUnicode_1BYTE_KIND:
9303 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9304 break;
9305 case PyUnicode_2BYTE_KIND:
9306 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9307 break;
9308 case PyUnicode_4BYTE_KIND:
9309 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9310 break;
9311 default:
9312 assert(0);
9313 break;
9314 }
9315 leave:
9316 PyMem_FREE(tmp);
9317 return res;
9318}
9319
Tim Peters8ce9f162004-08-27 01:49:32 +00009320PyObject *
9321PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009324 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009326 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009327 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9328 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009329 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009331 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009333 int use_memcpy;
9334 unsigned char *res_data = NULL, *sep_data = NULL;
9335 PyObject *last_obj;
9336 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337
Tim Peters05eba1f2004-08-27 21:32:02 +00009338 fseq = PySequence_Fast(seq, "");
9339 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009340 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009341 }
9342
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009343 /* NOTE: the following code can't call back into Python code,
9344 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009345 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009346
Tim Peters05eba1f2004-08-27 21:32:02 +00009347 seqlen = PySequence_Fast_GET_SIZE(fseq);
9348 /* If empty sequence, return u"". */
9349 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009350 Py_DECREF(fseq);
9351 Py_INCREF(unicode_empty);
9352 res = unicode_empty;
9353 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009354 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009355
Tim Peters05eba1f2004-08-27 21:32:02 +00009356 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009357 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009358 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009359 if (seqlen == 1) {
9360 if (PyUnicode_CheckExact(items[0])) {
9361 res = items[0];
9362 Py_INCREF(res);
9363 Py_DECREF(fseq);
9364 return res;
9365 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009366 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009367 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009368 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009369 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009370 /* Set up sep and seplen */
9371 if (separator == NULL) {
9372 /* fall back to a blank space separator */
9373 sep = PyUnicode_FromOrdinal(' ');
9374 if (!sep)
9375 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009376 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009377 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009378 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009379 else {
9380 if (!PyUnicode_Check(separator)) {
9381 PyErr_Format(PyExc_TypeError,
9382 "separator: expected str instance,"
9383 " %.80s found",
9384 Py_TYPE(separator)->tp_name);
9385 goto onError;
9386 }
9387 if (PyUnicode_READY(separator))
9388 goto onError;
9389 sep = separator;
9390 seplen = PyUnicode_GET_LENGTH(separator);
9391 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9392 /* inc refcount to keep this code path symmetric with the
9393 above case of a blank separator */
9394 Py_INCREF(sep);
9395 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009396 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009397 }
9398
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009399 /* There are at least two things to join, or else we have a subclass
9400 * of str in the sequence.
9401 * Do a pre-pass to figure out the total amount of space we'll
9402 * need (sz), and see whether all argument are strings.
9403 */
9404 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009405#ifdef Py_DEBUG
9406 use_memcpy = 0;
9407#else
9408 use_memcpy = 1;
9409#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009410 for (i = 0; i < seqlen; i++) {
9411 const Py_ssize_t old_sz = sz;
9412 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 if (!PyUnicode_Check(item)) {
9414 PyErr_Format(PyExc_TypeError,
9415 "sequence item %zd: expected str instance,"
9416 " %.80s found",
9417 i, Py_TYPE(item)->tp_name);
9418 goto onError;
9419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 if (PyUnicode_READY(item) == -1)
9421 goto onError;
9422 sz += PyUnicode_GET_LENGTH(item);
9423 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009424 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009425 if (i != 0)
9426 sz += seplen;
9427 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9428 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009429 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009430 goto onError;
9431 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009432 if (use_memcpy && last_obj != NULL) {
9433 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9434 use_memcpy = 0;
9435 }
9436 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009437 }
Tim Petersced69f82003-09-16 20:30:58 +00009438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009440 if (res == NULL)
9441 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009442
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009443 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009444#ifdef Py_DEBUG
9445 use_memcpy = 0;
9446#else
9447 if (use_memcpy) {
9448 res_data = PyUnicode_1BYTE_DATA(res);
9449 kind = PyUnicode_KIND(res);
9450 if (seplen != 0)
9451 sep_data = PyUnicode_1BYTE_DATA(sep);
9452 }
9453#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009455 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009456 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009457 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009458 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009459 if (use_memcpy) {
9460 Py_MEMCPY(res_data,
9461 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009462 kind * seplen);
9463 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009464 }
9465 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009466 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009467 res_offset += seplen;
9468 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009470 itemlen = PyUnicode_GET_LENGTH(item);
9471 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009472 if (use_memcpy) {
9473 Py_MEMCPY(res_data,
9474 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009475 kind * itemlen);
9476 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009477 }
9478 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009479 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009480 res_offset += itemlen;
9481 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009482 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009483 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009484 if (use_memcpy)
9485 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009486 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009487 else
9488 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009489
Tim Peters05eba1f2004-08-27 21:32:02 +00009490 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009492 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009494
Benjamin Peterson29060642009-01-31 22:14:21 +00009495 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009496 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009498 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009499 return NULL;
9500}
9501
/* FILL(kind, data, value, start, length)

   Store `length` copies of `value` into the character buffer `data`,
   beginning at element index `start`.  `kind` selects the element width
   (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND);
   PyUnicode_WCHAR_KIND and any other value trip an assertion.

   This is a statement macro: `value` and `length` are evaluated more
   than once, so arguments must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9525
Victor Stinnerd3f08822012-05-29 12:57:52 +02009526void
9527_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9528 Py_UCS4 fill_char)
9529{
9530 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9531 const void *data = PyUnicode_DATA(unicode);
9532 assert(PyUnicode_IS_READY(unicode));
9533 assert(unicode_modifiable(unicode));
9534 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9535 assert(start >= 0);
9536 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9537 FILL(kind, data, fill_char, start, length);
9538}
9539
Victor Stinner3fe55312012-01-04 00:33:50 +01009540Py_ssize_t
9541PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9542 Py_UCS4 fill_char)
9543{
9544 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009545
9546 if (!PyUnicode_Check(unicode)) {
9547 PyErr_BadInternalCall();
9548 return -1;
9549 }
9550 if (PyUnicode_READY(unicode) == -1)
9551 return -1;
9552 if (unicode_check_modifiable(unicode))
9553 return -1;
9554
Victor Stinnerd3f08822012-05-29 12:57:52 +02009555 if (start < 0) {
9556 PyErr_SetString(PyExc_IndexError, "string index out of range");
9557 return -1;
9558 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009559 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9560 PyErr_SetString(PyExc_ValueError,
9561 "fill character is bigger than "
9562 "the string maximum character");
9563 return -1;
9564 }
9565
9566 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9567 length = Py_MIN(maxlen, length);
9568 if (length <= 0)
9569 return 0;
9570
Victor Stinnerd3f08822012-05-29 12:57:52 +02009571 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009572 return length;
9573}
9574
Victor Stinner9310abb2011-10-05 00:59:23 +02009575static PyObject *
9576pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009577 Py_ssize_t left,
9578 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009580{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581 PyObject *u;
9582 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009583 int kind;
9584 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009585
9586 if (left < 0)
9587 left = 0;
9588 if (right < 0)
9589 right = 0;
9590
Victor Stinnerc4b49542011-12-11 22:44:26 +01009591 if (left == 0 && right == 0)
9592 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9595 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009596 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9597 return NULL;
9598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009600 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009602 if (!u)
9603 return NULL;
9604
9605 kind = PyUnicode_KIND(u);
9606 data = PyUnicode_DATA(u);
9607 if (left)
9608 FILL(kind, data, fill, 0, left);
9609 if (right)
9610 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009611 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009612 assert(_PyUnicode_CheckConsistency(u, 1));
9613 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614}
9615
Alexander Belopolsky40018472011-02-26 01:02:56 +00009616PyObject *
9617PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620
9621 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009622 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009623 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009624 if (PyUnicode_READY(string) == -1) {
9625 Py_DECREF(string);
9626 return NULL;
9627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Benjamin Petersonead6b532011-12-20 17:23:42 -06009629 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009631 if (PyUnicode_IS_ASCII(string))
9632 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009633 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009634 PyUnicode_GET_LENGTH(string), keepends);
9635 else
9636 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009637 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009638 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009639 break;
9640 case PyUnicode_2BYTE_KIND:
9641 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009642 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 PyUnicode_GET_LENGTH(string), keepends);
9644 break;
9645 case PyUnicode_4BYTE_KIND:
9646 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009647 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 PyUnicode_GET_LENGTH(string), keepends);
9649 break;
9650 default:
9651 assert(0);
9652 list = 0;
9653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009654 Py_DECREF(string);
9655 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656}
9657
Alexander Belopolsky40018472011-02-26 01:02:56 +00009658static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009659split(PyObject *self,
9660 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009661 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009662{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 int kind1, kind2, kind;
9664 void *buf1, *buf2;
9665 Py_ssize_t len1, len2;
9666 PyObject* out;
9667
Guido van Rossumd57fd912000-03-10 22:53:23 +00009668 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009669 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 if (PyUnicode_READY(self) == -1)
9672 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009675 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009677 if (PyUnicode_IS_ASCII(self))
9678 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009679 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009680 PyUnicode_GET_LENGTH(self), maxcount
9681 );
9682 else
9683 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009684 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009685 PyUnicode_GET_LENGTH(self), maxcount
9686 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 case PyUnicode_2BYTE_KIND:
9688 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009689 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 PyUnicode_GET_LENGTH(self), maxcount
9691 );
9692 case PyUnicode_4BYTE_KIND:
9693 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009694 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 PyUnicode_GET_LENGTH(self), maxcount
9696 );
9697 default:
9698 assert(0);
9699 return NULL;
9700 }
9701
9702 if (PyUnicode_READY(substring) == -1)
9703 return NULL;
9704
9705 kind1 = PyUnicode_KIND(self);
9706 kind2 = PyUnicode_KIND(substring);
9707 kind = kind1 > kind2 ? kind1 : kind2;
9708 buf1 = PyUnicode_DATA(self);
9709 buf2 = PyUnicode_DATA(substring);
9710 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009711 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 if (!buf1)
9713 return NULL;
9714 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 if (!buf2) {
9717 if (kind1 != kind) PyMem_Free(buf1);
9718 return NULL;
9719 }
9720 len1 = PyUnicode_GET_LENGTH(self);
9721 len2 = PyUnicode_GET_LENGTH(substring);
9722
Benjamin Petersonead6b532011-12-20 17:23:42 -06009723 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009725 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9726 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009727 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009728 else
9729 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009730 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 break;
9732 case PyUnicode_2BYTE_KIND:
9733 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009734 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 break;
9736 case PyUnicode_4BYTE_KIND:
9737 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009738 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 break;
9740 default:
9741 out = NULL;
9742 }
9743 if (kind1 != kind)
9744 PyMem_Free(buf1);
9745 if (kind2 != kind)
9746 PyMem_Free(buf2);
9747 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748}
9749
Alexander Belopolsky40018472011-02-26 01:02:56 +00009750static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009751rsplit(PyObject *self,
9752 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009753 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009755 int kind1, kind2, kind;
9756 void *buf1, *buf2;
9757 Py_ssize_t len1, len2;
9758 PyObject* out;
9759
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009760 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009761 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009763 if (PyUnicode_READY(self) == -1)
9764 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009767 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009769 if (PyUnicode_IS_ASCII(self))
9770 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009771 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 PyUnicode_GET_LENGTH(self), maxcount
9773 );
9774 else
9775 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009776 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009777 PyUnicode_GET_LENGTH(self), maxcount
9778 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 case PyUnicode_2BYTE_KIND:
9780 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009781 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 PyUnicode_GET_LENGTH(self), maxcount
9783 );
9784 case PyUnicode_4BYTE_KIND:
9785 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009786 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 PyUnicode_GET_LENGTH(self), maxcount
9788 );
9789 default:
9790 assert(0);
9791 return NULL;
9792 }
9793
9794 if (PyUnicode_READY(substring) == -1)
9795 return NULL;
9796
9797 kind1 = PyUnicode_KIND(self);
9798 kind2 = PyUnicode_KIND(substring);
9799 kind = kind1 > kind2 ? kind1 : kind2;
9800 buf1 = PyUnicode_DATA(self);
9801 buf2 = PyUnicode_DATA(substring);
9802 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 if (!buf1)
9805 return NULL;
9806 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (!buf2) {
9809 if (kind1 != kind) PyMem_Free(buf1);
9810 return NULL;
9811 }
9812 len1 = PyUnicode_GET_LENGTH(self);
9813 len2 = PyUnicode_GET_LENGTH(substring);
9814
Benjamin Petersonead6b532011-12-20 17:23:42 -06009815 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009817 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9818 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009819 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009820 else
9821 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009822 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009823 break;
9824 case PyUnicode_2BYTE_KIND:
9825 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 break;
9828 case PyUnicode_4BYTE_KIND:
9829 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 break;
9832 default:
9833 out = NULL;
9834 }
9835 if (kind1 != kind)
9836 PyMem_Free(buf1);
9837 if (kind2 != kind)
9838 PyMem_Free(buf2);
9839 return out;
9840}
9841
9842static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009843anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9844 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009846 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009847 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009848 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9849 return asciilib_find(buf1, len1, buf2, len2, offset);
9850 else
9851 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 case PyUnicode_2BYTE_KIND:
9853 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9854 case PyUnicode_4BYTE_KIND:
9855 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9856 }
9857 assert(0);
9858 return -1;
9859}
9860
9861static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009862anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9863 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009865 switch (kind) {
9866 case PyUnicode_1BYTE_KIND:
9867 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9868 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9869 else
9870 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9871 case PyUnicode_2BYTE_KIND:
9872 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9873 case PyUnicode_4BYTE_KIND:
9874 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9875 }
9876 assert(0);
9877 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009878}
9879
Alexander Belopolsky40018472011-02-26 01:02:56 +00009880static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881replace(PyObject *self, PyObject *str1,
9882 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 PyObject *u;
9885 char *sbuf = PyUnicode_DATA(self);
9886 char *buf1 = PyUnicode_DATA(str1);
9887 char *buf2 = PyUnicode_DATA(str2);
9888 int srelease = 0, release1 = 0, release2 = 0;
9889 int skind = PyUnicode_KIND(self);
9890 int kind1 = PyUnicode_KIND(str1);
9891 int kind2 = PyUnicode_KIND(str2);
9892 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9893 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9894 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009895 int mayshrink;
9896 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897
9898 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009899 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009901 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902
Victor Stinner59de0ee2011-10-07 10:01:28 +02009903 if (str1 == str2)
9904 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 if (skind < kind1)
9906 /* substring too wide to be present */
9907 goto nothing;
9908
Victor Stinner49a0a212011-10-12 23:46:10 +02009909 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9910 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9911 /* Replacing str1 with str2 may cause a maxchar reduction in the
9912 result string. */
9913 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009914 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009917 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009918 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009919 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009921 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009922 Py_UCS4 u1, u2;
9923 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009924 Py_ssize_t index, pos;
9925 char *src;
9926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009928 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9929 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009930 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009933 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009935 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009937
9938 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9939 index = 0;
9940 src = sbuf;
9941 while (--maxcount)
9942 {
9943 pos++;
9944 src += pos * PyUnicode_KIND(self);
9945 slen -= pos;
9946 index += pos;
9947 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9948 if (pos < 0)
9949 break;
9950 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9951 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009952 }
9953 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 int rkind = skind;
9955 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009956 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 if (kind1 < rkind) {
9959 /* widen substring */
9960 buf1 = _PyUnicode_AsKind(str1, rkind);
9961 if (!buf1) goto error;
9962 release1 = 1;
9963 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009964 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009965 if (i < 0)
9966 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 if (rkind > kind2) {
9968 /* widen replacement */
9969 buf2 = _PyUnicode_AsKind(str2, rkind);
9970 if (!buf2) goto error;
9971 release2 = 1;
9972 }
9973 else if (rkind < kind2) {
9974 /* widen self and buf1 */
9975 rkind = kind2;
9976 if (release1) PyMem_Free(buf1);
9977 sbuf = _PyUnicode_AsKind(self, rkind);
9978 if (!sbuf) goto error;
9979 srelease = 1;
9980 buf1 = _PyUnicode_AsKind(str1, rkind);
9981 if (!buf1) goto error;
9982 release1 = 1;
9983 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009984 u = PyUnicode_New(slen, maxchar);
9985 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009987 assert(PyUnicode_KIND(u) == rkind);
9988 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009989
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009990 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009991 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009992 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009994 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009996
9997 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009998 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009999 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010001 if (i == -1)
10002 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010003 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010005 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010007 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010009 }
10010 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010012 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 int rkind = skind;
10014 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010017 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 buf1 = _PyUnicode_AsKind(str1, rkind);
10019 if (!buf1) goto error;
10020 release1 = 1;
10021 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010022 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010023 if (n == 0)
10024 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010026 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 buf2 = _PyUnicode_AsKind(str2, rkind);
10028 if (!buf2) goto error;
10029 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010032 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 rkind = kind2;
10034 sbuf = _PyUnicode_AsKind(self, rkind);
10035 if (!sbuf) goto error;
10036 srelease = 1;
10037 if (release1) PyMem_Free(buf1);
10038 buf1 = _PyUnicode_AsKind(str1, rkind);
10039 if (!buf1) goto error;
10040 release1 = 1;
10041 }
10042 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10043 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010044 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 PyErr_SetString(PyExc_OverflowError,
10046 "replace string is too long");
10047 goto error;
10048 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010049 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010050 if (new_size == 0) {
10051 Py_INCREF(unicode_empty);
10052 u = unicode_empty;
10053 goto done;
10054 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010055 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 PyErr_SetString(PyExc_OverflowError,
10057 "replace string is too long");
10058 goto error;
10059 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010060 u = PyUnicode_New(new_size, maxchar);
10061 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010062 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010063 assert(PyUnicode_KIND(u) == rkind);
10064 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 ires = i = 0;
10066 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 while (n-- > 0) {
10068 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010069 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010071 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010072 if (j == -1)
10073 break;
10074 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010075 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010076 memcpy(res + rkind * ires,
10077 sbuf + rkind * i,
10078 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010080 }
10081 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010083 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010084 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010085 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010091 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010092 memcpy(res + rkind * ires,
10093 sbuf + rkind * i,
10094 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010095 }
10096 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010097 /* interleave */
10098 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010099 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010101 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010103 if (--n <= 0)
10104 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010105 memcpy(res + rkind * ires,
10106 sbuf + rkind * i,
10107 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 ires++;
10109 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010110 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010111 memcpy(res + rkind * ires,
10112 sbuf + rkind * i,
10113 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010115 }
10116
10117 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010118 unicode_adjust_maxchar(&u);
10119 if (u == NULL)
10120 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010122
10123 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (srelease)
10125 PyMem_FREE(sbuf);
10126 if (release1)
10127 PyMem_FREE(buf1);
10128 if (release2)
10129 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010130 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010132
Benjamin Peterson29060642009-01-31 22:14:21 +000010133 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010134 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 if (srelease)
10136 PyMem_FREE(sbuf);
10137 if (release1)
10138 PyMem_FREE(buf1);
10139 if (release2)
10140 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010141 return unicode_result_unchanged(self);
10142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 error:
10144 if (srelease && sbuf)
10145 PyMem_FREE(sbuf);
10146 if (release1 && buf1)
10147 PyMem_FREE(buf1);
10148 if (release2 && buf2)
10149 PyMem_FREE(buf2);
10150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010151}
10152
10153/* --- Unicode Object Methods --------------------------------------------- */
10154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010155PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010156 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157\n\
10158Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160
10161static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010162unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010163{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010164 if (PyUnicode_READY(self) == -1)
10165 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010166 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167}
10168
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010169PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010170 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171\n\
10172Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010173have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
10175static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010176unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010178 if (PyUnicode_READY(self) == -1)
10179 return NULL;
10180 if (PyUnicode_GET_LENGTH(self) == 0)
10181 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010182 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010183}
10184
Benjamin Petersond5890c82012-01-14 13:23:30 -050010185PyDoc_STRVAR(casefold__doc__,
10186 "S.casefold() -> str\n\
10187\n\
10188Return a version of S suitable for caseless comparisons.");
10189
10190static PyObject *
10191unicode_casefold(PyObject *self)
10192{
10193 if (PyUnicode_READY(self) == -1)
10194 return NULL;
10195 if (PyUnicode_IS_ASCII(self))
10196 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010197 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010198}
10199
10200
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010201/* Argument converter. Coerces to a single unicode character */
10202
10203static int
10204convert_uc(PyObject *obj, void *addr)
10205{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010207 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010208
Benjamin Peterson14339b62009-01-31 16:36:08 +000010209 uniobj = PyUnicode_FromObject(obj);
10210 if (uniobj == NULL) {
10211 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010212 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010213 return 0;
10214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010216 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010218 Py_DECREF(uniobj);
10219 return 0;
10220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010222 Py_DECREF(uniobj);
10223 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010224}
10225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010226PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010227 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010228\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010229Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010230done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010231
10232static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010233unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010235 Py_ssize_t marg, left;
10236 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 Py_UCS4 fillchar = ' ';
10238
Victor Stinnere9a29352011-10-01 02:14:59 +020010239 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241
Benjamin Petersonbac79492012-01-14 13:34:47 -050010242 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243 return NULL;
10244
Victor Stinnerc4b49542011-12-11 22:44:26 +010010245 if (PyUnicode_GET_LENGTH(self) >= width)
10246 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Victor Stinnerc4b49542011-12-11 22:44:26 +010010248 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010249 left = marg / 2 + (marg & width & 1);
10250
Victor Stinner9310abb2011-10-05 00:59:23 +020010251 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252}
10253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254/* This function assumes that str1 and str2 are readied by the caller. */
10255
Marc-André Lemburge5034372000-08-08 08:04:29 +000010256static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010257unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 int kind1, kind2;
10260 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010261 Py_ssize_t len1, len2;
10262 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010263
Victor Stinner90db9c42012-10-04 21:53:50 +020010264 /* a string is equal to itself */
10265 if (str1 == str2)
10266 return 0;
10267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 kind1 = PyUnicode_KIND(str1);
10269 kind2 = PyUnicode_KIND(str2);
10270 data1 = PyUnicode_DATA(str1);
10271 data2 = PyUnicode_DATA(str2);
10272 len1 = PyUnicode_GET_LENGTH(str1);
10273 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010274 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010275
Victor Stinner770e19e2012-10-04 22:59:45 +020010276 if (kind1 == 1 && kind2 == 1) {
10277 int cmp = memcmp(data1, data2, len);
10278 /* normalize result of memcmp() into the range [-1; 1] */
10279 if (cmp < 0)
10280 return -1;
10281 if (cmp > 0)
10282 return 1;
10283 }
10284 else {
10285 for (i = 0; i < len; ++i) {
10286 Py_UCS4 c1, c2;
10287 c1 = PyUnicode_READ(kind1, data1, i);
10288 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010289
Victor Stinner770e19e2012-10-04 22:59:45 +020010290 if (c1 != c2)
10291 return (c1 < c2) ? -1 : 1;
10292 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010293 }
10294
Victor Stinner770e19e2012-10-04 22:59:45 +020010295 if (len1 == len2)
10296 return 0;
10297 if (len1 < len2)
10298 return -1;
10299 else
10300 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010301}
10302
Alexander Belopolsky40018472011-02-26 01:02:56 +000010303int
10304PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10307 if (PyUnicode_READY(left) == -1 ||
10308 PyUnicode_READY(right) == -1)
10309 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010310 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010312 PyErr_Format(PyExc_TypeError,
10313 "Can't compare %.100s and %.100s",
10314 left->ob_type->tp_name,
10315 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316 return -1;
10317}
10318
Martin v. Löwis5b222132007-06-10 09:51:05 +000010319int
10320PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 Py_ssize_t i;
10323 int kind;
10324 void *data;
10325 Py_UCS4 chr;
10326
Victor Stinner910337b2011-10-03 03:20:16 +020010327 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 if (PyUnicode_READY(uni) == -1)
10329 return -1;
10330 kind = PyUnicode_KIND(uni);
10331 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010332 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10334 if (chr != str[i])
10335 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010336 /* This check keeps Python strings that end in '\0' from comparing equal
10337 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010339 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010340 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010341 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010342 return 0;
10343}
10344
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010345
Benjamin Peterson29060642009-01-31 22:14:21 +000010346#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010347 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010348
Alexander Belopolsky40018472011-02-26 01:02:56 +000010349PyObject *
10350PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010351{
10352 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010353
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010354 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10355 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 if (PyUnicode_READY(left) == -1 ||
10357 PyUnicode_READY(right) == -1)
10358 return NULL;
10359 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10360 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010361 if (op == Py_EQ) {
10362 Py_INCREF(Py_False);
10363 return Py_False;
10364 }
10365 if (op == Py_NE) {
10366 Py_INCREF(Py_True);
10367 return Py_True;
10368 }
10369 }
Victor Stinner90db9c42012-10-04 21:53:50 +020010370 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010371
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010372 /* Convert the return value to a Boolean */
10373 switch (op) {
10374 case Py_EQ:
10375 v = TEST_COND(result == 0);
10376 break;
10377 case Py_NE:
10378 v = TEST_COND(result != 0);
10379 break;
10380 case Py_LE:
10381 v = TEST_COND(result <= 0);
10382 break;
10383 case Py_GE:
10384 v = TEST_COND(result >= 0);
10385 break;
10386 case Py_LT:
10387 v = TEST_COND(result == -1);
10388 break;
10389 case Py_GT:
10390 v = TEST_COND(result == 1);
10391 break;
10392 default:
10393 PyErr_BadArgument();
10394 return NULL;
10395 }
10396 Py_INCREF(v);
10397 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010398 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010399
Brian Curtindfc80e32011-08-10 20:28:54 -050010400 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010401}
10402
Alexander Belopolsky40018472011-02-26 01:02:56 +000010403int
10404PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010405{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010406 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 int kind1, kind2, kind;
10408 void *buf1, *buf2;
10409 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010410 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010411
10412 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010413 sub = PyUnicode_FromObject(element);
10414 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010415 PyErr_Format(PyExc_TypeError,
10416 "'in <string>' requires string as left operand, not %s",
10417 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010418 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010419 }
10420
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010422 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010423 Py_DECREF(sub);
10424 return -1;
10425 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010426 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10427 Py_DECREF(sub);
10428 Py_DECREF(str);
10429 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010430
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 kind1 = PyUnicode_KIND(str);
10432 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010433 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 buf1 = PyUnicode_DATA(str);
10435 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010436 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010437 if (kind2 > kind) {
10438 Py_DECREF(sub);
10439 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010440 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010441 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010442 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (!buf2) {
10445 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010446 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 return -1;
10448 }
10449 len1 = PyUnicode_GET_LENGTH(str);
10450 len2 = PyUnicode_GET_LENGTH(sub);
10451
Benjamin Petersonead6b532011-12-20 17:23:42 -060010452 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 case PyUnicode_1BYTE_KIND:
10454 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10455 break;
10456 case PyUnicode_2BYTE_KIND:
10457 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10458 break;
10459 case PyUnicode_4BYTE_KIND:
10460 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10461 break;
10462 default:
10463 result = -1;
10464 assert(0);
10465 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466
10467 Py_DECREF(str);
10468 Py_DECREF(sub);
10469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (kind2 != kind)
10471 PyMem_Free(buf2);
10472
Guido van Rossum403d68b2000-03-13 15:55:09 +000010473 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010474}
10475
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476/* Concat to string or Unicode object giving a new Unicode object. */
10477
Alexander Belopolsky40018472011-02-26 01:02:56 +000010478PyObject *
10479PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010482 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010483 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484
10485 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010488 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010494 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010498 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 }
10502
Victor Stinner488fa492011-12-12 00:01:39 +010010503 u_len = PyUnicode_GET_LENGTH(u);
10504 v_len = PyUnicode_GET_LENGTH(v);
10505 if (u_len > PY_SSIZE_T_MAX - v_len) {
10506 PyErr_SetString(PyExc_OverflowError,
10507 "strings are too large to concat");
10508 goto onError;
10509 }
10510 new_len = u_len + v_len;
10511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010513 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010514 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010517 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010519 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010520 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10521 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 Py_DECREF(u);
10523 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010524 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526
Benjamin Peterson29060642009-01-31 22:14:21 +000010527 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 Py_XDECREF(u);
10529 Py_XDECREF(v);
10530 return NULL;
10531}
10532
Walter Dörwald1ab83302007-05-18 17:15:44 +000010533void
Victor Stinner23e56682011-10-03 03:54:37 +020010534PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010535{
Victor Stinner23e56682011-10-03 03:54:37 +020010536 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010537 Py_UCS4 maxchar, maxchar2;
10538 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010539
10540 if (p_left == NULL) {
10541 if (!PyErr_Occurred())
10542 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010543 return;
10544 }
Victor Stinner23e56682011-10-03 03:54:37 +020010545 left = *p_left;
10546 if (right == NULL || !PyUnicode_Check(left)) {
10547 if (!PyErr_Occurred())
10548 PyErr_BadInternalCall();
10549 goto error;
10550 }
10551
Benjamin Petersonbac79492012-01-14 13:34:47 -050010552 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010553 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010554 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010555 goto error;
10556
Victor Stinner488fa492011-12-12 00:01:39 +010010557 /* Shortcuts */
10558 if (left == unicode_empty) {
10559 Py_DECREF(left);
10560 Py_INCREF(right);
10561 *p_left = right;
10562 return;
10563 }
10564 if (right == unicode_empty)
10565 return;
10566
10567 left_len = PyUnicode_GET_LENGTH(left);
10568 right_len = PyUnicode_GET_LENGTH(right);
10569 if (left_len > PY_SSIZE_T_MAX - right_len) {
10570 PyErr_SetString(PyExc_OverflowError,
10571 "strings are too large to concat");
10572 goto error;
10573 }
10574 new_len = left_len + right_len;
10575
10576 if (unicode_modifiable(left)
10577 && PyUnicode_CheckExact(right)
10578 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010579 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10580 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010581 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010582 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010583 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10584 {
10585 /* append inplace */
10586 if (unicode_resize(p_left, new_len) != 0) {
10587 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10588 * deallocated so it cannot be put back into
10589 * 'variable'. The MemoryError is raised when there
10590 * is no value in 'variable', which might (very
10591 * remotely) be a cause of incompatibilities.
10592 */
10593 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010594 }
Victor Stinner488fa492011-12-12 00:01:39 +010010595 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010596 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010597 }
Victor Stinner488fa492011-12-12 00:01:39 +010010598 else {
10599 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10600 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010601 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010602
Victor Stinner488fa492011-12-12 00:01:39 +010010603 /* Concat the two Unicode strings */
10604 res = PyUnicode_New(new_len, maxchar);
10605 if (res == NULL)
10606 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010607 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10608 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010609 Py_DECREF(left);
10610 *p_left = res;
10611 }
10612 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010613 return;
10614
10615error:
Victor Stinner488fa492011-12-12 00:01:39 +010010616 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010617}
10618
10619void
10620PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10621{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010622 PyUnicode_Append(pleft, right);
10623 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010624}
10625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010626PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010627 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010630string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010631interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632
10633static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010634unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010635{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010636 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010637 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010638 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 int kind1, kind2, kind;
10641 void *buf1, *buf2;
10642 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643
Jesus Ceaac451502011-04-20 17:09:23 +020010644 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10645 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 kind1 = PyUnicode_KIND(self);
10649 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010650 if (kind2 > kind1)
10651 return PyLong_FromLong(0);
10652 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 buf1 = PyUnicode_DATA(self);
10654 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010656 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (!buf2) {
10658 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 return NULL;
10660 }
10661 len1 = PyUnicode_GET_LENGTH(self);
10662 len2 = PyUnicode_GET_LENGTH(substring);
10663
10664 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010665 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 case PyUnicode_1BYTE_KIND:
10667 iresult = ucs1lib_count(
10668 ((Py_UCS1*)buf1) + start, end - start,
10669 buf2, len2, PY_SSIZE_T_MAX
10670 );
10671 break;
10672 case PyUnicode_2BYTE_KIND:
10673 iresult = ucs2lib_count(
10674 ((Py_UCS2*)buf1) + start, end - start,
10675 buf2, len2, PY_SSIZE_T_MAX
10676 );
10677 break;
10678 case PyUnicode_4BYTE_KIND:
10679 iresult = ucs4lib_count(
10680 ((Py_UCS4*)buf1) + start, end - start,
10681 buf2, len2, PY_SSIZE_T_MAX
10682 );
10683 break;
10684 default:
10685 assert(0); iresult = 0;
10686 }
10687
10688 result = PyLong_FromSsize_t(iresult);
10689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (kind2 != kind)
10691 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010692
10693 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 return result;
10696}
10697
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010698PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010699 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010701Encode S using the codec registered for encoding. Default encoding\n\
10702is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010703handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010704a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10705'xmlcharrefreplace' as well as any other name registered with\n\
10706codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707
10708static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010709unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010710{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010711 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 char *encoding = NULL;
10713 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010714
Benjamin Peterson308d6372009-09-18 21:42:35 +000010715 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10716 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010718 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010719}
10720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010721PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723\n\
10724Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
10727static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010728unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010730 Py_ssize_t i, j, line_pos, src_len, incr;
10731 Py_UCS4 ch;
10732 PyObject *u;
10733 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010735 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010736 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737
10738 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740
Antoine Pitrou22425222011-10-04 19:10:51 +020010741 if (PyUnicode_READY(self) == -1)
10742 return NULL;
10743
Thomas Wouters7e474022000-07-16 12:04:32 +000010744 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010745 src_len = PyUnicode_GET_LENGTH(self);
10746 i = j = line_pos = 0;
10747 kind = PyUnicode_KIND(self);
10748 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010749 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010750 for (; i < src_len; i++) {
10751 ch = PyUnicode_READ(kind, src_data, i);
10752 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010753 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010755 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010757 goto overflow;
10758 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010759 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010760 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010761 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010764 goto overflow;
10765 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010767 if (ch == '\n' || ch == '\r')
10768 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010770 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010771 if (!found)
10772 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010773
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010775 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 if (!u)
10777 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010778 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779
Antoine Pitroue71d5742011-10-04 15:55:09 +020010780 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Antoine Pitroue71d5742011-10-04 15:55:09 +020010782 for (; i < src_len; i++) {
10783 ch = PyUnicode_READ(kind, src_data, i);
10784 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010785 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010786 incr = tabsize - (line_pos % tabsize);
10787 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010788 FILL(kind, dest_data, ' ', j, incr);
10789 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010791 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010792 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010793 line_pos++;
10794 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010795 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010796 if (ch == '\n' || ch == '\r')
10797 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010799 }
10800 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010801 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010802
Antoine Pitroue71d5742011-10-04 15:55:09 +020010803 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010804 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806}
10807
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010808PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010809 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810\n\
10811Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010812such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010813arguments start and end are interpreted as in slice notation.\n\
10814\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010815Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816
10817static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010820 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010821 Py_ssize_t start;
10822 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824
Jesus Ceaac451502011-04-20 17:09:23 +020010825 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10826 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 if (PyUnicode_READY(self) == -1)
10830 return NULL;
10831 if (PyUnicode_READY(substring) == -1)
10832 return NULL;
10833
Victor Stinner7931d9a2011-11-04 00:22:48 +010010834 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835
10836 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 if (result == -2)
10839 return NULL;
10840
Christian Heimes217cfd12007-12-02 14:31:20 +000010841 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842}
10843
10844static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010845unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010847 void *data;
10848 enum PyUnicode_Kind kind;
10849 Py_UCS4 ch;
10850 PyObject *res;
10851
10852 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10853 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010854 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010855 }
10856 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10857 PyErr_SetString(PyExc_IndexError, "string index out of range");
10858 return NULL;
10859 }
10860 kind = PyUnicode_KIND(self);
10861 data = PyUnicode_DATA(self);
10862 ch = PyUnicode_READ(kind, data, index);
10863 if (ch < 256)
10864 return get_latin1_char(ch);
10865
10866 res = PyUnicode_New(1, ch);
10867 if (res == NULL)
10868 return NULL;
10869 kind = PyUnicode_KIND(res);
10870 data = PyUnicode_DATA(res);
10871 PyUnicode_WRITE(kind, data, 0, ch);
10872 assert(_PyUnicode_CheckConsistency(res, 1));
10873 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874}
10875
Guido van Rossumc2504932007-09-18 19:42:40 +000010876/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010877 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010878static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010879unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Guido van Rossumc2504932007-09-18 19:42:40 +000010881 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010882 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010883
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010884#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010885 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010886#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 if (_PyUnicode_HASH(self) != -1)
10888 return _PyUnicode_HASH(self);
10889 if (PyUnicode_READY(self) == -1)
10890 return -1;
10891 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010892 /*
10893 We make the hash of the empty string be 0, rather than using
10894 (prefix ^ suffix), since this slightly obfuscates the hash secret
10895 */
10896 if (len == 0) {
10897 _PyUnicode_HASH(self) = 0;
10898 return 0;
10899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900
10901 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010902#define HASH(P) \
10903 x ^= (Py_uhash_t) *P << 7; \
10904 while (--len >= 0) \
10905 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010906
Georg Brandl2fb477c2012-02-21 00:33:36 +010010907 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010908 switch (PyUnicode_KIND(self)) {
10909 case PyUnicode_1BYTE_KIND: {
10910 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10911 HASH(c);
10912 break;
10913 }
10914 case PyUnicode_2BYTE_KIND: {
10915 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10916 HASH(s);
10917 break;
10918 }
10919 default: {
10920 Py_UCS4 *l;
10921 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10922 "Impossible switch case in unicode_hash");
10923 l = PyUnicode_4BYTE_DATA(self);
10924 HASH(l);
10925 break;
10926 }
10927 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010928 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10929 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930
Guido van Rossumc2504932007-09-18 19:42:40 +000010931 if (x == -1)
10932 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010934 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010938PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010939 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010941Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942
10943static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010946 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010947 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010948 Py_ssize_t start;
10949 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950
Jesus Ceaac451502011-04-20 17:09:23 +020010951 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10952 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 if (PyUnicode_READY(self) == -1)
10956 return NULL;
10957 if (PyUnicode_READY(substring) == -1)
10958 return NULL;
10959
Victor Stinner7931d9a2011-11-04 00:22:48 +010010960 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
10962 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 if (result == -2)
10965 return NULL;
10966
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967 if (result < 0) {
10968 PyErr_SetString(PyExc_ValueError, "substring not found");
10969 return NULL;
10970 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971
Christian Heimes217cfd12007-12-02 14:31:20 +000010972 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973}
10974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010975PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010978Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
10981static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010982unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 Py_ssize_t i, length;
10985 int kind;
10986 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987 int cased;
10988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (PyUnicode_READY(self) == -1)
10990 return NULL;
10991 length = PyUnicode_GET_LENGTH(self);
10992 kind = PyUnicode_KIND(self);
10993 data = PyUnicode_DATA(self);
10994
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (length == 1)
10997 return PyBool_FromLong(
10998 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011000 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011002 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011003
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 for (i = 0; i < length; i++) {
11006 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011007
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11009 return PyBool_FromLong(0);
11010 else if (!cased && Py_UNICODE_ISLOWER(ch))
11011 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011013 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014}
11015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011016PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011017 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011019Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011020at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
11022static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011023unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 Py_ssize_t i, length;
11026 int kind;
11027 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028 int cased;
11029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 if (PyUnicode_READY(self) == -1)
11031 return NULL;
11032 length = PyUnicode_GET_LENGTH(self);
11033 kind = PyUnicode_KIND(self);
11034 data = PyUnicode_DATA(self);
11035
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (length == 1)
11038 return PyBool_FromLong(
11039 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011041 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011043 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011044
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 for (i = 0; i < length; i++) {
11047 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011048
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11050 return PyBool_FromLong(0);
11051 else if (!cased && Py_UNICODE_ISUPPER(ch))
11052 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011054 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055}
11056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011057PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011060Return True if S is a titlecased string and there is at least one\n\
11061character in S, i.e. upper- and titlecase characters may only\n\
11062follow uncased characters and lowercase characters only cased ones.\n\
11063Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011066unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011068 Py_ssize_t i, length;
11069 int kind;
11070 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071 int cased, previous_is_cased;
11072
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (PyUnicode_READY(self) == -1)
11074 return NULL;
11075 length = PyUnicode_GET_LENGTH(self);
11076 kind = PyUnicode_KIND(self);
11077 data = PyUnicode_DATA(self);
11078
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (length == 1) {
11081 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11082 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11083 (Py_UNICODE_ISUPPER(ch) != 0));
11084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011086 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011088 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011089
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 cased = 0;
11091 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 for (i = 0; i < length; i++) {
11093 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011094
Benjamin Peterson29060642009-01-31 22:14:21 +000011095 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11096 if (previous_is_cased)
11097 return PyBool_FromLong(0);
11098 previous_is_cased = 1;
11099 cased = 1;
11100 }
11101 else if (Py_UNICODE_ISLOWER(ch)) {
11102 if (!previous_is_cased)
11103 return PyBool_FromLong(0);
11104 previous_is_cased = 1;
11105 cased = 1;
11106 }
11107 else
11108 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011110 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111}
11112
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011113PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011114 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011116Return True if all characters in S are whitespace\n\
11117and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
11119static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011120unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011122 Py_ssize_t i, length;
11123 int kind;
11124 void *data;
11125
11126 if (PyUnicode_READY(self) == -1)
11127 return NULL;
11128 length = PyUnicode_GET_LENGTH(self);
11129 kind = PyUnicode_KIND(self);
11130 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 if (length == 1)
11134 return PyBool_FromLong(
11135 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011137 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 for (i = 0; i < length; i++) {
11142 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011143 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011146 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147}
11148
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011151\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011152Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011153and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011154
11155static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011156unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011157{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 Py_ssize_t i, length;
11159 int kind;
11160 void *data;
11161
11162 if (PyUnicode_READY(self) == -1)
11163 return NULL;
11164 length = PyUnicode_GET_LENGTH(self);
11165 kind = PyUnicode_KIND(self);
11166 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011167
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011168 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (length == 1)
11170 return PyBool_FromLong(
11171 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011172
11173 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011175 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011176
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 for (i = 0; i < length; i++) {
11178 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011180 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011182}
11183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011184PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011186\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011187Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011188and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011189
11190static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011191unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011192{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011193 int kind;
11194 void *data;
11195 Py_ssize_t len, i;
11196
11197 if (PyUnicode_READY(self) == -1)
11198 return NULL;
11199
11200 kind = PyUnicode_KIND(self);
11201 data = PyUnicode_DATA(self);
11202 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011203
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011204 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (len == 1) {
11206 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11207 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11208 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011209
11210 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 for (i = 0; i < len; i++) {
11215 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011216 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011219 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011225Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
11228static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 Py_ssize_t i, length;
11232 int kind;
11233 void *data;
11234
11235 if (PyUnicode_READY(self) == -1)
11236 return NULL;
11237 length = PyUnicode_GET_LENGTH(self);
11238 kind = PyUnicode_KIND(self);
11239 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (length == 1)
11243 return PyBool_FromLong(
11244 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011246 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 for (i = 0; i < length; i++) {
11251 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011254 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255}
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011260Return True if all characters in S are digits\n\
11261and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011262
11263static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011264unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 Py_ssize_t i, length;
11267 int kind;
11268 void *data;
11269
11270 if (PyUnicode_READY(self) == -1)
11271 return NULL;
11272 length = PyUnicode_GET_LENGTH(self);
11273 kind = PyUnicode_KIND(self);
11274 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 if (length == 1) {
11278 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11279 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011282 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011284 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011286 for (i = 0; i < length; i++) {
11287 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011290 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291}
11292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011293PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011294 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011296Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
11299static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011300unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 Py_ssize_t i, length;
11303 int kind;
11304 void *data;
11305
11306 if (PyUnicode_READY(self) == -1)
11307 return NULL;
11308 length = PyUnicode_GET_LENGTH(self);
11309 kind = PyUnicode_KIND(self);
11310 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 if (length == 1)
11314 return PyBool_FromLong(
11315 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011317 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011319 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321 for (i = 0; i < length; i++) {
11322 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011325 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326}
11327
Martin v. Löwis47383402007-08-15 07:32:56 +000011328int
11329PyUnicode_IsIdentifier(PyObject *self)
11330{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 int kind;
11332 void *data;
11333 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011334 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 if (PyUnicode_READY(self) == -1) {
11337 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011338 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 }
11340
11341 /* Special case for empty strings */
11342 if (PyUnicode_GET_LENGTH(self) == 0)
11343 return 0;
11344 kind = PyUnicode_KIND(self);
11345 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011346
11347 /* PEP 3131 says that the first character must be in
11348 XID_Start and subsequent characters in XID_Continue,
11349 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011350 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011351 letters, digits, underscore). However, given the current
11352 definition of XID_Start and XID_Continue, it is sufficient
11353 to check just for these, except that _ must be allowed
11354 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011356 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011357 return 0;
11358
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011359 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011362 return 1;
11363}
11364
11365PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011367\n\
11368Return True if S is a valid identifier according\n\
11369to the language definition.");
11370
11371static PyObject*
11372unicode_isidentifier(PyObject *self)
11373{
11374 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11375}
11376
Georg Brandl559e5d72008-06-11 18:37:52 +000011377PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011378 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011379\n\
11380Return True if all characters in S are considered\n\
11381printable in repr() or S is empty, False otherwise.");
11382
11383static PyObject*
11384unicode_isprintable(PyObject *self)
11385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 Py_ssize_t i, length;
11387 int kind;
11388 void *data;
11389
11390 if (PyUnicode_READY(self) == -1)
11391 return NULL;
11392 length = PyUnicode_GET_LENGTH(self);
11393 kind = PyUnicode_KIND(self);
11394 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011395
11396 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 if (length == 1)
11398 return PyBool_FromLong(
11399 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 for (i = 0; i < length; i++) {
11402 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011403 Py_RETURN_FALSE;
11404 }
11405 }
11406 Py_RETURN_TRUE;
11407}
11408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011409PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011410 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411\n\
11412Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011413iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
11415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011416unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011418 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419}
11420
Martin v. Löwis18e16552006-02-15 17:27:45 +000011421static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011422unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (PyUnicode_READY(self) == -1)
11425 return -1;
11426 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011432Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011433done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011436unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011438 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 Py_UCS4 fillchar = ' ';
11440
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011441 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 return NULL;
11443
Benjamin Petersonbac79492012-01-14 13:34:47 -050011444 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011445 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
Victor Stinnerc4b49542011-12-11 22:44:26 +010011447 if (PyUnicode_GET_LENGTH(self) >= width)
11448 return unicode_result_unchanged(self);
11449
11450 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451}
11452
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011453PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011454 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457
11458static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011459unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011461 if (PyUnicode_READY(self) == -1)
11462 return NULL;
11463 if (PyUnicode_IS_ASCII(self))
11464 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011465 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011466}
11467
/* Selectors telling the strip helpers which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for a selector: its format string without the leading "|O:" */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11476
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011477/* externally visible for str.strip(unicode) */
11478PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011479_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 void *data;
11482 int kind;
11483 Py_ssize_t i, j, len;
11484 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11487 return NULL;
11488
11489 kind = PyUnicode_KIND(self);
11490 data = PyUnicode_DATA(self);
11491 len = PyUnicode_GET_LENGTH(self);
11492 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11493 PyUnicode_DATA(sepobj),
11494 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011495
Benjamin Peterson14339b62009-01-31 16:36:08 +000011496 i = 0;
11497 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 while (i < len &&
11499 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 i++;
11501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011502 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011503
Benjamin Peterson14339b62009-01-31 16:36:08 +000011504 j = len;
11505 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 do {
11507 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 } while (j >= i &&
11509 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011511 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011512
Victor Stinner7931d9a2011-11-04 00:22:48 +010011513 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514}
11515
11516PyObject*
11517PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11518{
11519 unsigned char *data;
11520 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011521 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522
Victor Stinnerde636f32011-10-01 03:55:54 +020011523 if (PyUnicode_READY(self) == -1)
11524 return NULL;
11525
Victor Stinner684d5fd2012-05-03 02:32:34 +020011526 length = PyUnicode_GET_LENGTH(self);
11527 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011528
Victor Stinner684d5fd2012-05-03 02:32:34 +020011529 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011530 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531
Victor Stinnerde636f32011-10-01 03:55:54 +020011532 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011533 PyErr_SetString(PyExc_IndexError, "string index out of range");
11534 return NULL;
11535 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011536 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011537 Py_INCREF(unicode_empty);
11538 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011539 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011540
Victor Stinner684d5fd2012-05-03 02:32:34 +020011541 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011542 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011543 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011544 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011545 }
11546 else {
11547 kind = PyUnicode_KIND(self);
11548 data = PyUnicode_1BYTE_DATA(self);
11549 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011550 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011551 length);
11552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554
11555static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011556do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 int kind;
11559 void *data;
11560 Py_ssize_t len, i, j;
11561
11562 if (PyUnicode_READY(self) == -1)
11563 return NULL;
11564
11565 kind = PyUnicode_KIND(self);
11566 data = PyUnicode_DATA(self);
11567 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011568
Benjamin Peterson14339b62009-01-31 16:36:08 +000011569 i = 0;
11570 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011572 i++;
11573 }
11574 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011575
Benjamin Peterson14339b62009-01-31 16:36:08 +000011576 j = len;
11577 if (striptype != LEFTSTRIP) {
11578 do {
11579 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011581 j++;
11582 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011583
Victor Stinner7931d9a2011-11-04 00:22:48 +010011584 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585}
11586
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011587
11588static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011589do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011590{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011591 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011592
Benjamin Peterson14339b62009-01-31 16:36:08 +000011593 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11594 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011595
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 if (sep != NULL && sep != Py_None) {
11597 if (PyUnicode_Check(sep))
11598 return _PyUnicode_XStrip(self, striptype, sep);
11599 else {
11600 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011601 "%s arg must be None or str",
11602 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011603 return NULL;
11604 }
11605 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011606
Benjamin Peterson14339b62009-01-31 16:36:08 +000011607 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011608}
11609
11610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011613\n\
11614Return a copy of the string S with leading and trailing\n\
11615whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011616If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617
11618static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011619unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011620{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011621 if (PyTuple_GET_SIZE(args) == 0)
11622 return do_strip(self, BOTHSTRIP); /* Common case */
11623 else
11624 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011625}
11626
11627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630\n\
11631Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011632If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633
11634static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011635unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011636{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011637 if (PyTuple_GET_SIZE(args) == 0)
11638 return do_strip(self, LEFTSTRIP); /* Common case */
11639 else
11640 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641}
11642
11643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011644PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011646\n\
11647Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011648If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011649
11650static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011651unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011653 if (PyTuple_GET_SIZE(args) == 0)
11654 return do_strip(self, RIGHTSTRIP); /* Common case */
11655 else
11656 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657}
11658
11659
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011661unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011665
Georg Brandl222de0f2009-04-12 12:01:50 +000011666 if (len < 1) {
11667 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011668 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011669 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011670
Victor Stinnerc4b49542011-12-11 22:44:26 +010011671 /* no repeat, return original string */
11672 if (len == 1)
11673 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011674
Benjamin Petersonbac79492012-01-14 13:34:47 -050011675 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 return NULL;
11677
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011678 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011679 PyErr_SetString(PyExc_OverflowError,
11680 "repeated string is too long");
11681 return NULL;
11682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011684
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011685 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686 if (!u)
11687 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011688 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 if (PyUnicode_GET_LENGTH(str) == 1) {
11691 const int kind = PyUnicode_KIND(str);
11692 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011693 if (kind == PyUnicode_1BYTE_KIND) {
11694 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011695 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011696 }
11697 else if (kind == PyUnicode_2BYTE_KIND) {
11698 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011699 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011700 ucs2[n] = fill_char;
11701 } else {
11702 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11703 assert(kind == PyUnicode_4BYTE_KIND);
11704 for (n = 0; n < len; ++n)
11705 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 }
11708 else {
11709 /* number of characters copied this far */
11710 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011711 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 char *to = (char *) PyUnicode_DATA(u);
11713 Py_MEMCPY(to, PyUnicode_DATA(str),
11714 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011715 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 n = (done <= nchars-done) ? done : nchars-done;
11717 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011718 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
11721
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011722 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011723 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724}
11725
Alexander Belopolsky40018472011-02-26 01:02:56 +000011726PyObject *
11727PyUnicode_Replace(PyObject *obj,
11728 PyObject *subobj,
11729 PyObject *replobj,
11730 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731{
11732 PyObject *self;
11733 PyObject *str1;
11734 PyObject *str2;
11735 PyObject *result;
11736
11737 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011738 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011739 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011741 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 Py_DECREF(self);
11743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 }
11745 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011746 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011747 Py_DECREF(self);
11748 Py_DECREF(str1);
11749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011751 if (PyUnicode_READY(self) == -1 ||
11752 PyUnicode_READY(str1) == -1 ||
11753 PyUnicode_READY(str2) == -1)
11754 result = NULL;
11755 else
11756 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 Py_DECREF(self);
11758 Py_DECREF(str1);
11759 Py_DECREF(str2);
11760 return result;
11761}
11762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011763PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011764 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765\n\
11766Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011767old replaced by new. If the optional argument count is\n\
11768given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769
11770static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 PyObject *str1;
11774 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011775 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 PyObject *result;
11777
Martin v. Löwis18e16552006-02-15 17:27:45 +000011778 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011780 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011783 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 return NULL;
11785 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011786 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 Py_DECREF(str1);
11788 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011789 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011790 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11791 result = NULL;
11792 else
11793 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
11795 Py_DECREF(str1);
11796 Py_DECREF(str2);
11797 return result;
11798}
11799
Alexander Belopolsky40018472011-02-26 01:02:56 +000011800static PyObject *
11801unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011803 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 Py_ssize_t isize;
11805 Py_ssize_t osize, squote, dquote, i, o;
11806 Py_UCS4 max, quote;
11807 int ikind, okind;
11808 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011811 return NULL;
11812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 isize = PyUnicode_GET_LENGTH(unicode);
11814 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 /* Compute length of output, quote characters, and
11817 maximum character */
11818 osize = 2; /* quotes */
11819 max = 127;
11820 squote = dquote = 0;
11821 ikind = PyUnicode_KIND(unicode);
11822 for (i = 0; i < isize; i++) {
11823 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11824 switch (ch) {
11825 case '\'': squote++; osize++; break;
11826 case '"': dquote++; osize++; break;
11827 case '\\': case '\t': case '\r': case '\n':
11828 osize += 2; break;
11829 default:
11830 /* Fast-path ASCII */
11831 if (ch < ' ' || ch == 0x7f)
11832 osize += 4; /* \xHH */
11833 else if (ch < 0x7f)
11834 osize++;
11835 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11836 osize++;
11837 max = ch > max ? ch : max;
11838 }
11839 else if (ch < 0x100)
11840 osize += 4; /* \xHH */
11841 else if (ch < 0x10000)
11842 osize += 6; /* \uHHHH */
11843 else
11844 osize += 10; /* \uHHHHHHHH */
11845 }
11846 }
11847
11848 quote = '\'';
11849 if (squote) {
11850 if (dquote)
11851 /* Both squote and dquote present. Use squote,
11852 and escape them */
11853 osize += squote;
11854 else
11855 quote = '"';
11856 }
11857
11858 repr = PyUnicode_New(osize, max);
11859 if (repr == NULL)
11860 return NULL;
11861 okind = PyUnicode_KIND(repr);
11862 odata = PyUnicode_DATA(repr);
11863
11864 PyUnicode_WRITE(okind, odata, 0, quote);
11865 PyUnicode_WRITE(okind, odata, osize-1, quote);
11866
11867 for (i = 0, o = 1; i < isize; i++) {
11868 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011869
11870 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 if ((ch == quote) || (ch == '\\')) {
11872 PyUnicode_WRITE(okind, odata, o++, '\\');
11873 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011874 continue;
11875 }
11876
Benjamin Peterson29060642009-01-31 22:14:21 +000011877 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011878 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 PyUnicode_WRITE(okind, odata, o++, '\\');
11880 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011881 }
11882 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 PyUnicode_WRITE(okind, odata, o++, '\\');
11884 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011885 }
11886 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 PyUnicode_WRITE(okind, odata, o++, '\\');
11888 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011889 }
11890
11891 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011892 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 PyUnicode_WRITE(okind, odata, o++, '\\');
11894 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011895 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11896 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011897 }
11898
Georg Brandl559e5d72008-06-11 18:37:52 +000011899 /* Copy ASCII characters as-is */
11900 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011902 }
11903
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011905 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011906 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011907 (categories Z* and C* except ASCII space)
11908 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011910 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011911 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011914 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11915 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011916 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011917 /* Map 16-bit characters to '\uxxxx' */
11918 else if (ch <= 0xffff) {
11919 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011920 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11921 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11922 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11923 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011924 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011925 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011926 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011927 PyUnicode_WRITE(okind, odata, o++, 'U');
11928 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11929 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11930 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11931 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011936 }
11937 }
11938 /* Copy characters as-is */
11939 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011941 }
11942 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011945 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011946 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947}
11948
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011949PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951\n\
11952Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011953such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954arguments start and end are interpreted as in slice notation.\n\
11955\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011956Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011957
11958static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011960{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011961 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011962 Py_ssize_t start;
11963 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011964 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
Jesus Ceaac451502011-04-20 17:09:23 +020011966 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11967 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 if (PyUnicode_READY(self) == -1)
11971 return NULL;
11972 if (PyUnicode_READY(substring) == -1)
11973 return NULL;
11974
Victor Stinner7931d9a2011-11-04 00:22:48 +010011975 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976
11977 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 if (result == -2)
11980 return NULL;
11981
Christian Heimes217cfd12007-12-02 14:31:20 +000011982 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983}
11984
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011985PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011988Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989
11990static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011993 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011994 Py_ssize_t start;
11995 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011996 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011997
Jesus Ceaac451502011-04-20 17:09:23 +020011998 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11999 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004 if (PyUnicode_READY(substring) == -1)
12005 return NULL;
12006
Victor Stinner7931d9a2011-11-04 00:22:48 +010012007 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
12009 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 if (result == -2)
12012 return NULL;
12013
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014 if (result < 0) {
12015 PyErr_SetString(PyExc_ValueError, "substring not found");
12016 return NULL;
12017 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018
Christian Heimes217cfd12007-12-02 14:31:20 +000012019 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020}
12021
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012022PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012025Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012026done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027
12028static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012029unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012031 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 Py_UCS4 fillchar = ' ';
12033
Victor Stinnere9a29352011-10-01 02:14:59 +020012034 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012036
Benjamin Petersonbac79492012-01-14 13:34:47 -050012037 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038 return NULL;
12039
Victor Stinnerc4b49542011-12-11 22:44:26 +010012040 if (PyUnicode_GET_LENGTH(self) >= width)
12041 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042
Victor Stinnerc4b49542011-12-11 22:44:26 +010012043 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012044}
12045
Alexander Belopolsky40018472011-02-26 01:02:56 +000012046PyObject *
12047PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048{
12049 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012050
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051 s = PyUnicode_FromObject(s);
12052 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012053 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012054 if (sep != NULL) {
12055 sep = PyUnicode_FromObject(sep);
12056 if (sep == NULL) {
12057 Py_DECREF(s);
12058 return NULL;
12059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060 }
12061
Victor Stinner9310abb2011-10-05 00:59:23 +020012062 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012063
12064 Py_DECREF(s);
12065 Py_XDECREF(sep);
12066 return result;
12067}
12068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012069PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012070 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071\n\
12072Return a list of the words in S, using sep as the\n\
12073delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012074splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012075whitespace string is a separator and empty strings are\n\
12076removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
12078static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012079unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012081 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012083 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012085 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12086 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 return NULL;
12088
12089 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012092 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012093 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012094 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
Thomas Wouters477c8d52006-05-27 19:21:47 +000012097PyObject *
12098PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12099{
12100 PyObject* str_obj;
12101 PyObject* sep_obj;
12102 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 int kind1, kind2, kind;
12104 void *buf1 = NULL, *buf2 = NULL;
12105 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012106
12107 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012108 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012110 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012111 if (!sep_obj) {
12112 Py_DECREF(str_obj);
12113 return NULL;
12114 }
12115 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12116 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012117 Py_DECREF(str_obj);
12118 return NULL;
12119 }
12120
Victor Stinner14f8f022011-10-05 20:58:25 +020012121 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012123 kind = Py_MAX(kind1, kind2);
12124 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012126 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 if (!buf1)
12128 goto onError;
12129 buf2 = PyUnicode_DATA(sep_obj);
12130 if (kind2 != kind)
12131 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12132 if (!buf2)
12133 goto onError;
12134 len1 = PyUnicode_GET_LENGTH(str_obj);
12135 len2 = PyUnicode_GET_LENGTH(sep_obj);
12136
Benjamin Petersonead6b532011-12-20 17:23:42 -060012137 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012139 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12140 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12141 else
12142 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 break;
12144 case PyUnicode_2BYTE_KIND:
12145 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12146 break;
12147 case PyUnicode_4BYTE_KIND:
12148 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12149 break;
12150 default:
12151 assert(0);
12152 out = 0;
12153 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012154
12155 Py_DECREF(sep_obj);
12156 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (kind1 != kind)
12158 PyMem_Free(buf1);
12159 if (kind2 != kind)
12160 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012161
12162 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012163 onError:
12164 Py_DECREF(sep_obj);
12165 Py_DECREF(str_obj);
12166 if (kind1 != kind && buf1)
12167 PyMem_Free(buf1);
12168 if (kind2 != kind && buf2)
12169 PyMem_Free(buf2);
12170 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012171}
12172
12173
12174PyObject *
12175PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12176{
12177 PyObject* str_obj;
12178 PyObject* sep_obj;
12179 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 int kind1, kind2, kind;
12181 void *buf1 = NULL, *buf2 = NULL;
12182 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012183
12184 str_obj = PyUnicode_FromObject(str_in);
12185 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012186 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187 sep_obj = PyUnicode_FromObject(sep_in);
12188 if (!sep_obj) {
12189 Py_DECREF(str_obj);
12190 return NULL;
12191 }
12192
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 kind1 = PyUnicode_KIND(str_in);
12194 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012195 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 buf1 = PyUnicode_DATA(str_in);
12197 if (kind1 != kind)
12198 buf1 = _PyUnicode_AsKind(str_in, kind);
12199 if (!buf1)
12200 goto onError;
12201 buf2 = PyUnicode_DATA(sep_obj);
12202 if (kind2 != kind)
12203 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12204 if (!buf2)
12205 goto onError;
12206 len1 = PyUnicode_GET_LENGTH(str_obj);
12207 len2 = PyUnicode_GET_LENGTH(sep_obj);
12208
Benjamin Petersonead6b532011-12-20 17:23:42 -060012209 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012211 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12212 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12213 else
12214 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 break;
12216 case PyUnicode_2BYTE_KIND:
12217 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12218 break;
12219 case PyUnicode_4BYTE_KIND:
12220 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12221 break;
12222 default:
12223 assert(0);
12224 out = 0;
12225 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012226
12227 Py_DECREF(sep_obj);
12228 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 if (kind1 != kind)
12230 PyMem_Free(buf1);
12231 if (kind2 != kind)
12232 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012233
12234 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 onError:
12236 Py_DECREF(sep_obj);
12237 Py_DECREF(str_obj);
12238 if (kind1 != kind && buf1)
12239 PyMem_Free(buf1);
12240 if (kind2 != kind && buf2)
12241 PyMem_Free(buf2);
12242 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012243}
12244
12245PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012247\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012248Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012249the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012250found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251
12252static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012253unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012254{
Victor Stinner9310abb2011-10-05 00:59:23 +020012255 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012256}
12257
12258PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012259 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012260\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012261Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012262the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012263separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264
12265static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012266unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012267{
Victor Stinner9310abb2011-10-05 00:59:23 +020012268 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012269}
12270
Alexander Belopolsky40018472011-02-26 01:02:56 +000012271PyObject *
12272PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012273{
12274 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012275
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012276 s = PyUnicode_FromObject(s);
12277 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 if (sep != NULL) {
12280 sep = PyUnicode_FromObject(sep);
12281 if (sep == NULL) {
12282 Py_DECREF(s);
12283 return NULL;
12284 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012285 }
12286
Victor Stinner9310abb2011-10-05 00:59:23 +020012287 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012288
12289 Py_DECREF(s);
12290 Py_XDECREF(sep);
12291 return result;
12292}
12293
12294PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012295 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012296\n\
12297Return a list of the words in S, using sep as the\n\
12298delimiter string, starting at the end of the string and\n\
12299working to the front. If maxsplit is given, at most maxsplit\n\
12300splits are done. If sep is not specified, any whitespace string\n\
12301is a separator.");
12302
12303static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012304unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012305{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012306 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012307 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012308 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012309
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012310 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12311 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312 return NULL;
12313
12314 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012316 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012317 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012318 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012319 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012320}
12321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012322PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324\n\
12325Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012326Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012327is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328
12329static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012330unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012332 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012333 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012334
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012335 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12336 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012337 return NULL;
12338
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012339 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340}
12341
12342static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012343PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012345 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012346}
12347
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012348PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012349 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350\n\
12351Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012352and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012353
12354static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012355unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012357 if (PyUnicode_READY(self) == -1)
12358 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012359 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360}
12361
Georg Brandlceee0772007-11-27 23:48:05 +000012362PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012364\n\
12365Return a translation table usable for str.translate().\n\
12366If there is only one argument, it must be a dictionary mapping Unicode\n\
12367ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012368Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012369If there are two arguments, they must be strings of equal length, and\n\
12370in the resulting dictionary, each character in x will be mapped to the\n\
12371character at the same position in y. If there is a third argument, it\n\
12372must be a string, whose characters will be mapped to None in the result.");
12373
12374static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012375unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012376{
12377 PyObject *x, *y = NULL, *z = NULL;
12378 PyObject *new = NULL, *key, *value;
12379 Py_ssize_t i = 0;
12380 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012381
Georg Brandlceee0772007-11-27 23:48:05 +000012382 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12383 return NULL;
12384 new = PyDict_New();
12385 if (!new)
12386 return NULL;
12387 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 int x_kind, y_kind, z_kind;
12389 void *x_data, *y_data, *z_data;
12390
Georg Brandlceee0772007-11-27 23:48:05 +000012391 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012392 if (!PyUnicode_Check(x)) {
12393 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12394 "be a string if there is a second argument");
12395 goto err;
12396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012398 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12399 "arguments must have equal length");
12400 goto err;
12401 }
12402 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 x_kind = PyUnicode_KIND(x);
12404 y_kind = PyUnicode_KIND(y);
12405 x_data = PyUnicode_DATA(x);
12406 y_data = PyUnicode_DATA(y);
12407 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12408 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012409 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012410 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012411 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012412 if (!value) {
12413 Py_DECREF(key);
12414 goto err;
12415 }
Georg Brandlceee0772007-11-27 23:48:05 +000012416 res = PyDict_SetItem(new, key, value);
12417 Py_DECREF(key);
12418 Py_DECREF(value);
12419 if (res < 0)
12420 goto err;
12421 }
12422 /* create entries for deleting chars in z */
12423 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 z_kind = PyUnicode_KIND(z);
12425 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012426 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012428 if (!key)
12429 goto err;
12430 res = PyDict_SetItem(new, key, Py_None);
12431 Py_DECREF(key);
12432 if (res < 0)
12433 goto err;
12434 }
12435 }
12436 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 int kind;
12438 void *data;
12439
Georg Brandlceee0772007-11-27 23:48:05 +000012440 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012441 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012442 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12443 "to maketrans it must be a dict");
12444 goto err;
12445 }
12446 /* copy entries into the new dict, converting string keys to int keys */
12447 while (PyDict_Next(x, &i, &key, &value)) {
12448 if (PyUnicode_Check(key)) {
12449 /* convert string keys to integer keys */
12450 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012451 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012452 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12453 "table must be of length 1");
12454 goto err;
12455 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 kind = PyUnicode_KIND(key);
12457 data = PyUnicode_DATA(key);
12458 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012459 if (!newkey)
12460 goto err;
12461 res = PyDict_SetItem(new, newkey, value);
12462 Py_DECREF(newkey);
12463 if (res < 0)
12464 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012465 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012466 /* just keep integer keys */
12467 if (PyDict_SetItem(new, key, value) < 0)
12468 goto err;
12469 } else {
12470 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12471 "be strings or integers");
12472 goto err;
12473 }
12474 }
12475 }
12476 return new;
12477 err:
12478 Py_DECREF(new);
12479 return NULL;
12480}
12481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012482PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012483 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484\n\
12485Return a copy of the string S, where all characters have been mapped\n\
12486through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012487Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012488Unmapped characters are left untouched. Characters mapped to None\n\
12489are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
12491static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012495}
12496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012497PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012498 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012500Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501
12502static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012503unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012505 if (PyUnicode_READY(self) == -1)
12506 return NULL;
12507 if (PyUnicode_IS_ASCII(self))
12508 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012509 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012510}
12511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012512PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012515Pad a numeric string S with zeros on the left, to fill a field\n\
12516of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
12518static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012519unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012521 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012522 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012523 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 int kind;
12525 void *data;
12526 Py_UCS4 chr;
12527
Martin v. Löwis18e16552006-02-15 17:27:45 +000012528 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529 return NULL;
12530
Benjamin Petersonbac79492012-01-14 13:34:47 -050012531 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012532 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
Victor Stinnerc4b49542011-12-11 22:44:26 +010012534 if (PyUnicode_GET_LENGTH(self) >= width)
12535 return unicode_result_unchanged(self);
12536
12537 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
12539 u = pad(self, fill, 0, '0');
12540
Walter Dörwald068325e2002-04-15 13:36:47 +000012541 if (u == NULL)
12542 return NULL;
12543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 kind = PyUnicode_KIND(u);
12545 data = PyUnicode_DATA(u);
12546 chr = PyUnicode_READ(kind, data, fill);
12547
12548 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 PyUnicode_WRITE(kind, data, 0, chr);
12551 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552 }
12553
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012554 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012555 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012566PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012567 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012569Return True if S starts with the specified prefix, False otherwise.\n\
12570With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012571With optional end, stop comparing S at that position.\n\
12572prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573
12574static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012575unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012578 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012579 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012581 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012582 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583
Jesus Ceaac451502011-04-20 17:09:23 +020012584 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012585 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012586 if (PyTuple_Check(subobj)) {
12587 Py_ssize_t i;
12588 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012589 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012590 if (substring == NULL)
12591 return NULL;
12592 result = tailmatch(self, substring, start, end, -1);
12593 Py_DECREF(substring);
12594 if (result) {
12595 Py_RETURN_TRUE;
12596 }
12597 }
12598 /* nothing matched */
12599 Py_RETURN_FALSE;
12600 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012601 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012602 if (substring == NULL) {
12603 if (PyErr_ExceptionMatches(PyExc_TypeError))
12604 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12605 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012607 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012608 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012610 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611}
12612
12613
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012614PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012615 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012617Return True if S ends with the specified suffix, False otherwise.\n\
12618With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012619With optional end, stop comparing S at that position.\n\
12620suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621
12622static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012623unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012626 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012627 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012628 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012629 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012630 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631
Jesus Ceaac451502011-04-20 17:09:23 +020012632 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012634 if (PyTuple_Check(subobj)) {
12635 Py_ssize_t i;
12636 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012637 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012638 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012639 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012640 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012641 result = tailmatch(self, substring, start, end, +1);
12642 Py_DECREF(substring);
12643 if (result) {
12644 Py_RETURN_TRUE;
12645 }
12646 }
12647 Py_RETURN_FALSE;
12648 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012649 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012650 if (substring == NULL) {
12651 if (PyErr_ExceptionMatches(PyExc_TypeError))
12652 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12653 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012654 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012655 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012656 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012658 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659}
12660
Victor Stinner202fdca2012-05-07 12:47:02 +020012661Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012662_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012663{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012664 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012665 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12666 writer->data = PyUnicode_DATA(writer->buffer);
12667 writer->kind = PyUnicode_KIND(writer->buffer);
12668}
12669
Victor Stinnerd3f08822012-05-29 12:57:52 +020012670void
12671_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012672{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012673 memset(writer, 0, sizeof(*writer));
12674#ifdef Py_DEBUG
12675 writer->kind = 5; /* invalid kind */
12676#endif
12677 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012678 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012679}
12680
Victor Stinnerd3f08822012-05-29 12:57:52 +020012681int
12682_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12683 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012684{
12685 Py_ssize_t newlen;
12686 PyObject *newbuffer;
12687
Victor Stinnerd3f08822012-05-29 12:57:52 +020012688 assert(length > 0);
12689
Victor Stinner202fdca2012-05-07 12:47:02 +020012690 if (length > PY_SSIZE_T_MAX - writer->pos) {
12691 PyErr_NoMemory();
12692 return -1;
12693 }
12694 newlen = writer->pos + length;
12695
Victor Stinnerd3f08822012-05-29 12:57:52 +020012696 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012697 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012698 /* overallocate 25% to limit the number of resize */
12699 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12700 newlen += newlen / 4;
12701 if (newlen < writer->min_length)
12702 newlen = writer->min_length;
12703 }
12704 writer->buffer = PyUnicode_New(newlen, maxchar);
12705 if (writer->buffer == NULL)
12706 return -1;
12707 _PyUnicodeWriter_Update(writer);
12708 return 0;
12709 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012710
Victor Stinnerd3f08822012-05-29 12:57:52 +020012711 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012712 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012713 /* overallocate 25% to limit the number of resize */
12714 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12715 newlen += newlen / 4;
12716 if (newlen < writer->min_length)
12717 newlen = writer->min_length;
12718 }
12719
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012720 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012721 /* resize + widen */
12722 newbuffer = PyUnicode_New(newlen, maxchar);
12723 if (newbuffer == NULL)
12724 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012725 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12726 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012727 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012728 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012729 }
12730 else {
12731 newbuffer = resize_compact(writer->buffer, newlen);
12732 if (newbuffer == NULL)
12733 return -1;
12734 }
12735 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012736 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012737 }
12738 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012739 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012740 newbuffer = PyUnicode_New(writer->size, maxchar);
12741 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012742 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012743 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12744 writer->buffer, 0, writer->pos);
12745 Py_DECREF(writer->buffer);
12746 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012747 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012748 }
12749 return 0;
12750}
12751
Victor Stinnerd3f08822012-05-29 12:57:52 +020012752int
12753_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12754{
12755 Py_UCS4 maxchar;
12756 Py_ssize_t len;
12757
12758 if (PyUnicode_READY(str) == -1)
12759 return -1;
12760 len = PyUnicode_GET_LENGTH(str);
12761 if (len == 0)
12762 return 0;
12763 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12764 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012765 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012766 Py_INCREF(str);
12767 writer->buffer = str;
12768 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012769 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012770 writer->size = 0;
12771 writer->pos += len;
12772 return 0;
12773 }
12774 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12775 return -1;
12776 }
12777 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12778 str, 0, len);
12779 writer->pos += len;
12780 return 0;
12781}
12782
Victor Stinnere215d962012-10-06 23:03:36 +020012783int
12784_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12785{
12786 Py_UCS4 maxchar;
12787
12788 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12789 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12790 return -1;
12791 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12792 writer->pos += len;
12793 return 0;
12794}
12795
Victor Stinnerd3f08822012-05-29 12:57:52 +020012796PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012797_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012798{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012799 if (writer->pos == 0) {
12800 Py_XDECREF(writer->buffer);
12801 Py_INCREF(unicode_empty);
12802 return unicode_empty;
12803 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012804 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012805 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12806 return writer->buffer;
12807 }
12808 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12809 PyObject *newbuffer;
12810 newbuffer = resize_compact(writer->buffer, writer->pos);
12811 if (newbuffer == NULL) {
12812 Py_DECREF(writer->buffer);
12813 return NULL;
12814 }
12815 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012816 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012817 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012818 return writer->buffer;
12819}
12820
Victor Stinnerd3f08822012-05-29 12:57:52 +020012821void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012822_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012823{
12824 Py_CLEAR(writer->buffer);
12825}
12826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012827#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012828
12829PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012830 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012831\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012832Return a formatted version of S, using substitutions from args and kwargs.\n\
12833The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012834
Eric Smith27bbca62010-11-04 17:06:58 +000012835PyDoc_STRVAR(format_map__doc__,
12836 "S.format_map(mapping) -> str\n\
12837\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012838Return a formatted version of S, using substitutions from mapping.\n\
12839The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012840
Eric Smith4a7d76d2008-05-30 18:10:19 +000012841static PyObject *
12842unicode__format__(PyObject* self, PyObject* args)
12843{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012844 PyObject *format_spec;
12845 _PyUnicodeWriter writer;
12846 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012847
12848 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12849 return NULL;
12850
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 if (PyUnicode_READY(self) == -1)
12852 return NULL;
12853 _PyUnicodeWriter_Init(&writer, 0);
12854 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12855 self, format_spec, 0,
12856 PyUnicode_GET_LENGTH(format_spec));
12857 if (ret == -1) {
12858 _PyUnicodeWriter_Dealloc(&writer);
12859 return NULL;
12860 }
12861 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012862}
12863
Eric Smith8c663262007-08-25 02:26:07 +000012864PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012866\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012867Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012868
12869static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012870unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 Py_ssize_t size;
12873
12874 /* If it's a compact object, account for base structure +
12875 character data. */
12876 if (PyUnicode_IS_COMPACT_ASCII(v))
12877 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12878 else if (PyUnicode_IS_COMPACT(v))
12879 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012880 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881 else {
12882 /* If it is a two-block object, account for base object, and
12883 for character block if present. */
12884 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012885 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012886 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012887 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012888 }
12889 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012890 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012891 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012893 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012894 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895
12896 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012897}
12898
12899PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012901
12902static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012903unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012904{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012905 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 if (!copy)
12907 return NULL;
12908 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012909}
12910
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012912 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012913 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012914 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12915 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012916 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12917 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012918 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012919 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12920 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12921 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12922 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12923 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012925 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12926 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12927 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012928 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12930 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12931 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012932 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012933 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012934 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012935 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012936 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12937 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12938 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12939 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12940 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12941 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12942 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12943 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12944 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12945 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12946 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12947 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12948 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12949 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012950 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012951 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012953 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012954 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012955 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012956 {"maketrans", (PyCFunction) unicode_maketrans,
12957 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012958 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012959#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012960 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012961 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962#endif
12963
Benjamin Peterson14339b62009-01-31 16:36:08 +000012964 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965 {NULL, NULL}
12966};
12967
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012968static PyObject *
12969unicode_mod(PyObject *v, PyObject *w)
12970{
Brian Curtindfc80e32011-08-10 20:28:54 -050012971 if (!PyUnicode_Check(v))
12972 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012974}
12975
12976static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012977 0, /*nb_add*/
12978 0, /*nb_subtract*/
12979 0, /*nb_multiply*/
12980 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012981};
12982
Guido van Rossumd57fd912000-03-10 22:53:23 +000012983static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012984 (lenfunc) unicode_length, /* sq_length */
12985 PyUnicode_Concat, /* sq_concat */
12986 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12987 (ssizeargfunc) unicode_getitem, /* sq_item */
12988 0, /* sq_slice */
12989 0, /* sq_ass_item */
12990 0, /* sq_ass_slice */
12991 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992};
12993
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012994static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012995unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012996{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 if (PyUnicode_READY(self) == -1)
12998 return NULL;
12999
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013000 if (PyIndex_Check(item)) {
13001 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013002 if (i == -1 && PyErr_Occurred())
13003 return NULL;
13004 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013006 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013007 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013008 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013009 PyObject *result;
13010 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013011 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013012 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013015 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013016 return NULL;
13017 }
13018
13019 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013020 Py_INCREF(unicode_empty);
13021 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013023 slicelength == PyUnicode_GET_LENGTH(self)) {
13024 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013025 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013026 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013027 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013028 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013029 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013030 src_kind = PyUnicode_KIND(self);
13031 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013032 if (!PyUnicode_IS_ASCII(self)) {
13033 kind_limit = kind_maxchar_limit(src_kind);
13034 max_char = 0;
13035 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13036 ch = PyUnicode_READ(src_kind, src_data, cur);
13037 if (ch > max_char) {
13038 max_char = ch;
13039 if (max_char >= kind_limit)
13040 break;
13041 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013042 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013043 }
Victor Stinner55c99112011-10-13 01:17:06 +020013044 else
13045 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013046 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013047 if (result == NULL)
13048 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013049 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013050 dest_data = PyUnicode_DATA(result);
13051
13052 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013053 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13054 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013055 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013056 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013057 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013058 } else {
13059 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13060 return NULL;
13061 }
13062}
13063
13064static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013065 (lenfunc)unicode_length, /* mp_length */
13066 (binaryfunc)unicode_subscript, /* mp_subscript */
13067 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013068};
13069
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071/* Helpers for PyUnicode_Format() */
13072
Victor Stinnera47082312012-10-04 02:19:54 +020013073struct unicode_formatter_t {
13074 PyObject *args;
13075 int args_owned;
13076 Py_ssize_t arglen, argidx;
13077 PyObject *dict;
13078
13079 enum PyUnicode_Kind fmtkind;
13080 Py_ssize_t fmtcnt, fmtpos;
13081 void *fmtdata;
13082 PyObject *fmtstr;
13083
13084 _PyUnicodeWriter writer;
13085};
13086
13087struct unicode_format_arg_t {
13088 Py_UCS4 ch;
13089 int flags;
13090 Py_ssize_t width;
13091 int prec;
13092 int sign;
13093};
13094
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013096unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097{
Victor Stinnera47082312012-10-04 02:19:54 +020013098 Py_ssize_t argidx = ctx->argidx;
13099
13100 if (argidx < ctx->arglen) {
13101 ctx->argidx++;
13102 if (ctx->arglen < 0)
13103 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 else
Victor Stinnera47082312012-10-04 02:19:54 +020013105 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013106 }
13107 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 return NULL;
13110}
13111
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013112/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
Victor Stinnera47082312012-10-04 02:19:54 +020013114/* Format a float into the writer if the writer is not NULL, or into *p_output
13115 otherwise.
13116
13117 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013118static int
Victor Stinnera47082312012-10-04 02:19:54 +020013119formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13120 PyObject **p_output,
13121 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013123 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013125 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013126 int prec;
13127 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013128
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129 x = PyFloat_AsDouble(v);
13130 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013131 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013132
Victor Stinnera47082312012-10-04 02:19:54 +020013133 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013136
Victor Stinnera47082312012-10-04 02:19:54 +020013137 if (arg->flags & F_ALT)
13138 dtoa_flags = Py_DTSF_ALT;
13139 else
13140 dtoa_flags = 0;
13141 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013142 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013143 return -1;
13144 len = strlen(p);
13145 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013146 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13147 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013148 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013149 }
Victor Stinner184252a2012-06-16 02:57:41 +020013150 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013151 writer->pos += len;
13152 }
13153 else
13154 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013155 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157}
13158
Victor Stinnerd0880d52012-04-27 23:40:13 +020013159/* formatlong() emulates the format codes d, u, o, x and X, and
13160 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13161 * Python's regular ints.
13162 * Return value: a new PyUnicodeObject*, or NULL if error.
13163 * The output string is of the form
13164 * "-"? ("0x" | "0X")? digit+
13165 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13166 * set in flags. The case of hex digits will be correct,
13167 * There will be at least prec digits, zero-filled on the left if
13168 * necessary to get that many.
13169 * val object to be converted
13170 * flags bitmask of format flags; only F_ALT is looked at
13171 * prec minimum number of digits; 0-fill on left if needed
13172 * type a character in [duoxX]; u acts the same as d
13173 *
13174 * CAUTION: o, x and X conversions on regular ints can never
13175 * produce a '-' sign, but can for Python's unbounded ints.
13176 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013177static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013178formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013179{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013180 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013182 Py_ssize_t i;
13183 int sign; /* 1 if '-', else 0 */
13184 int len; /* number of characters */
13185 Py_ssize_t llen;
13186 int numdigits; /* len == numnondigits + numdigits */
13187 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013188 int prec = arg->prec;
13189 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013190
Victor Stinnerd0880d52012-04-27 23:40:13 +020013191 /* Avoid exceeding SSIZE_T_MAX */
13192 if (prec > INT_MAX-3) {
13193 PyErr_SetString(PyExc_OverflowError,
13194 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013196 }
13197
13198 assert(PyLong_Check(val));
13199
13200 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013201 default:
13202 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013203 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013204 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013205 case 'u':
13206 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013207 if (PyBool_Check(val))
13208 result = PyNumber_ToBase(val, 10);
13209 else
13210 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013211 break;
13212 case 'o':
13213 numnondigits = 2;
13214 result = PyNumber_ToBase(val, 8);
13215 break;
13216 case 'x':
13217 case 'X':
13218 numnondigits = 2;
13219 result = PyNumber_ToBase(val, 16);
13220 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013221 }
13222 if (!result)
13223 return NULL;
13224
13225 assert(unicode_modifiable(result));
13226 assert(PyUnicode_IS_READY(result));
13227 assert(PyUnicode_IS_ASCII(result));
13228
13229 /* To modify the string in-place, there can only be one reference. */
13230 if (Py_REFCNT(result) != 1) {
13231 PyErr_BadInternalCall();
13232 return NULL;
13233 }
13234 buf = PyUnicode_DATA(result);
13235 llen = PyUnicode_GET_LENGTH(result);
13236 if (llen > INT_MAX) {
13237 PyErr_SetString(PyExc_ValueError,
13238 "string too large in _PyBytes_FormatLong");
13239 return NULL;
13240 }
13241 len = (int)llen;
13242 sign = buf[0] == '-';
13243 numnondigits += sign;
13244 numdigits = len - numnondigits;
13245 assert(numdigits > 0);
13246
13247 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013248 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013249 (type == 'o' || type == 'x' || type == 'X'))) {
13250 assert(buf[sign] == '0');
13251 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13252 buf[sign+1] == 'o');
13253 numnondigits -= 2;
13254 buf += 2;
13255 len -= 2;
13256 if (sign)
13257 buf[0] = '-';
13258 assert(len == numnondigits + numdigits);
13259 assert(numdigits > 0);
13260 }
13261
13262 /* Fill with leading zeroes to meet minimum width. */
13263 if (prec > numdigits) {
13264 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13265 numnondigits + prec);
13266 char *b1;
13267 if (!r1) {
13268 Py_DECREF(result);
13269 return NULL;
13270 }
13271 b1 = PyBytes_AS_STRING(r1);
13272 for (i = 0; i < numnondigits; ++i)
13273 *b1++ = *buf++;
13274 for (i = 0; i < prec - numdigits; i++)
13275 *b1++ = '0';
13276 for (i = 0; i < numdigits; i++)
13277 *b1++ = *buf++;
13278 *b1 = '\0';
13279 Py_DECREF(result);
13280 result = r1;
13281 buf = PyBytes_AS_STRING(result);
13282 len = numnondigits + prec;
13283 }
13284
13285 /* Fix up case for hex conversions. */
13286 if (type == 'X') {
13287 /* Need to convert all lower case letters to upper case.
13288 and need to convert 0x to 0X (and -0x to -0X). */
13289 for (i = 0; i < len; i++)
13290 if (buf[i] >= 'a' && buf[i] <= 'x')
13291 buf[i] -= 'a'-'A';
13292 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013293 if (!PyUnicode_Check(result)
13294 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013295 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013296 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013297 Py_DECREF(result);
13298 result = unicode;
13299 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013300 else if (len != PyUnicode_GET_LENGTH(result)) {
13301 if (PyUnicode_Resize(&result, len) < 0)
13302 Py_CLEAR(result);
13303 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013304 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013305}
13306
Victor Stinner621ef3d2012-10-02 00:33:47 +020013307/* Format an integer.
13308 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013309 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013310 * -1 and raise an exception on error */
13311static int
Victor Stinnera47082312012-10-04 02:19:54 +020013312mainformatlong(PyObject *v,
13313 struct unicode_format_arg_t *arg,
13314 PyObject **p_output,
13315 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013316{
13317 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013318 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013319
13320 if (!PyNumber_Check(v))
13321 goto wrongtype;
13322
13323 if (!PyLong_Check(v)) {
13324 iobj = PyNumber_Long(v);
13325 if (iobj == NULL) {
13326 if (PyErr_ExceptionMatches(PyExc_TypeError))
13327 goto wrongtype;
13328 return -1;
13329 }
13330 assert(PyLong_Check(iobj));
13331 }
13332 else {
13333 iobj = v;
13334 Py_INCREF(iobj);
13335 }
13336
13337 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013338 && arg->width == -1 && arg->prec == -1
13339 && !(arg->flags & (F_SIGN | F_BLANK))
13340 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013341 {
13342 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013343 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013344 int base;
13345
Victor Stinnera47082312012-10-04 02:19:54 +020013346 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013347 {
13348 default:
13349 assert(0 && "'type' not in [diuoxX]");
13350 case 'd':
13351 case 'i':
13352 case 'u':
13353 base = 10;
13354 break;
13355 case 'o':
13356 base = 8;
13357 break;
13358 case 'x':
13359 case 'X':
13360 base = 16;
13361 break;
13362 }
13363
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013364 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13365 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013366 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013367 }
13368 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013369 return 1;
13370 }
13371
Victor Stinnera47082312012-10-04 02:19:54 +020013372 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013373 Py_DECREF(iobj);
13374 if (res == NULL)
13375 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013376 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013377 return 0;
13378
13379wrongtype:
13380 PyErr_Format(PyExc_TypeError,
13381 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013382 "not %.200s",
13383 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013384 return -1;
13385}
13386
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013387static Py_UCS4
13388formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013390 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013391 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013392 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013393 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 goto onError;
13396 }
13397 else {
13398 /* Integer input truncated to a character */
13399 long x;
13400 x = PyLong_AsLong(v);
13401 if (x == -1 && PyErr_Occurred())
13402 goto onError;
13403
Victor Stinner8faf8212011-12-08 22:14:11 +010013404 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013405 PyErr_SetString(PyExc_OverflowError,
13406 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013407 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 }
13409
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013411 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013412
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013414 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417}
13418
Victor Stinnera47082312012-10-04 02:19:54 +020013419/* Parse options of an argument: flags, width, precision.
13420 Handle also "%(name)" syntax.
13421
13422 Return 0 if the argument has been formatted into arg->str.
13423 Return 1 if the argument has been written into ctx->writer,
13424 Raise an exception and return -1 on error. */
13425static int
13426unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13427 struct unicode_format_arg_t *arg)
13428{
13429#define FORMAT_READ(ctx) \
13430 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13431
13432 PyObject *v;
13433
13434 arg->ch = FORMAT_READ(ctx);
13435 if (arg->ch == '(') {
13436 /* Get argument value from a dictionary. Example: "%(name)s". */
13437 Py_ssize_t keystart;
13438 Py_ssize_t keylen;
13439 PyObject *key;
13440 int pcount = 1;
13441
13442 if (ctx->dict == NULL) {
13443 PyErr_SetString(PyExc_TypeError,
13444 "format requires a mapping");
13445 return -1;
13446 }
13447 ++ctx->fmtpos;
13448 --ctx->fmtcnt;
13449 keystart = ctx->fmtpos;
13450 /* Skip over balanced parentheses */
13451 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13452 arg->ch = FORMAT_READ(ctx);
13453 if (arg->ch == ')')
13454 --pcount;
13455 else if (arg->ch == '(')
13456 ++pcount;
13457 ctx->fmtpos++;
13458 }
13459 keylen = ctx->fmtpos - keystart - 1;
13460 if (ctx->fmtcnt < 0 || pcount > 0) {
13461 PyErr_SetString(PyExc_ValueError,
13462 "incomplete format key");
13463 return -1;
13464 }
13465 key = PyUnicode_Substring(ctx->fmtstr,
13466 keystart, keystart + keylen);
13467 if (key == NULL)
13468 return -1;
13469 if (ctx->args_owned) {
13470 Py_DECREF(ctx->args);
13471 ctx->args_owned = 0;
13472 }
13473 ctx->args = PyObject_GetItem(ctx->dict, key);
13474 Py_DECREF(key);
13475 if (ctx->args == NULL)
13476 return -1;
13477 ctx->args_owned = 1;
13478 ctx->arglen = -1;
13479 ctx->argidx = -2;
13480 }
13481
13482 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13483 arg->flags = 0;
13484 while (--ctx->fmtcnt >= 0) {
13485 arg->ch = FORMAT_READ(ctx);
13486 ctx->fmtpos++;
13487 switch (arg->ch) {
13488 case '-': arg->flags |= F_LJUST; continue;
13489 case '+': arg->flags |= F_SIGN; continue;
13490 case ' ': arg->flags |= F_BLANK; continue;
13491 case '#': arg->flags |= F_ALT; continue;
13492 case '0': arg->flags |= F_ZERO; continue;
13493 }
13494 break;
13495 }
13496
13497 /* Parse width. Example: "%10s" => width=10 */
13498 arg->width = -1;
13499 if (arg->ch == '*') {
13500 v = unicode_format_getnextarg(ctx);
13501 if (v == NULL)
13502 return -1;
13503 if (!PyLong_Check(v)) {
13504 PyErr_SetString(PyExc_TypeError,
13505 "* wants int");
13506 return -1;
13507 }
13508 arg->width = PyLong_AsLong(v);
13509 if (arg->width == -1 && PyErr_Occurred())
13510 return -1;
13511 if (arg->width < 0) {
13512 arg->flags |= F_LJUST;
13513 arg->width = -arg->width;
13514 }
13515 if (--ctx->fmtcnt >= 0) {
13516 arg->ch = FORMAT_READ(ctx);
13517 ctx->fmtpos++;
13518 }
13519 }
13520 else if (arg->ch >= '0' && arg->ch <= '9') {
13521 arg->width = arg->ch - '0';
13522 while (--ctx->fmtcnt >= 0) {
13523 arg->ch = FORMAT_READ(ctx);
13524 ctx->fmtpos++;
13525 if (arg->ch < '0' || arg->ch > '9')
13526 break;
13527 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13528 mixing signed and unsigned comparison. Since arg->ch is between
13529 '0' and '9', casting to int is safe. */
13530 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13531 PyErr_SetString(PyExc_ValueError,
13532 "width too big");
13533 return -1;
13534 }
13535 arg->width = arg->width*10 + (arg->ch - '0');
13536 }
13537 }
13538
13539 /* Parse precision. Example: "%.3f" => prec=3 */
13540 arg->prec = -1;
13541 if (arg->ch == '.') {
13542 arg->prec = 0;
13543 if (--ctx->fmtcnt >= 0) {
13544 arg->ch = FORMAT_READ(ctx);
13545 ctx->fmtpos++;
13546 }
13547 if (arg->ch == '*') {
13548 v = unicode_format_getnextarg(ctx);
13549 if (v == NULL)
13550 return -1;
13551 if (!PyLong_Check(v)) {
13552 PyErr_SetString(PyExc_TypeError,
13553 "* wants int");
13554 return -1;
13555 }
13556 arg->prec = PyLong_AsLong(v);
13557 if (arg->prec == -1 && PyErr_Occurred())
13558 return -1;
13559 if (arg->prec < 0)
13560 arg->prec = 0;
13561 if (--ctx->fmtcnt >= 0) {
13562 arg->ch = FORMAT_READ(ctx);
13563 ctx->fmtpos++;
13564 }
13565 }
13566 else if (arg->ch >= '0' && arg->ch <= '9') {
13567 arg->prec = arg->ch - '0';
13568 while (--ctx->fmtcnt >= 0) {
13569 arg->ch = FORMAT_READ(ctx);
13570 ctx->fmtpos++;
13571 if (arg->ch < '0' || arg->ch > '9')
13572 break;
13573 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13574 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013575 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013576 return -1;
13577 }
13578 arg->prec = arg->prec*10 + (arg->ch - '0');
13579 }
13580 }
13581 }
13582
13583 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13584 if (ctx->fmtcnt >= 0) {
13585 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13586 if (--ctx->fmtcnt >= 0) {
13587 arg->ch = FORMAT_READ(ctx);
13588 ctx->fmtpos++;
13589 }
13590 }
13591 }
13592 if (ctx->fmtcnt < 0) {
13593 PyErr_SetString(PyExc_ValueError,
13594 "incomplete format");
13595 return -1;
13596 }
13597 return 0;
13598
13599#undef FORMAT_READ
13600}
13601
13602/* Format one argument. Supported conversion specifiers:
13603
13604 - "s", "r", "a": any type
13605 - "i", "d", "u", "o", "x", "X": int
13606 - "e", "E", "f", "F", "g", "G": float
13607 - "c": int or str (1 character)
13608
13609 Return 0 if the argument has been formatted into *p_str,
13610 1 if the argument has been written into ctx->writer,
13611 -1 on error. */
13612static int
13613unicode_format_arg_format(struct unicode_formatter_t *ctx,
13614 struct unicode_format_arg_t *arg,
13615 PyObject **p_str)
13616{
13617 PyObject *v;
13618 _PyUnicodeWriter *writer = &ctx->writer;
13619
13620 if (ctx->fmtcnt == 0)
13621 ctx->writer.overallocate = 0;
13622
13623 if (arg->ch == '%') {
13624 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13625 return -1;
13626 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13627 writer->pos += 1;
13628 return 1;
13629 }
13630
13631 v = unicode_format_getnextarg(ctx);
13632 if (v == NULL)
13633 return -1;
13634
13635 arg->sign = 0;
13636
13637 switch (arg->ch) {
13638
13639 case 's':
13640 case 'r':
13641 case 'a':
13642 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13643 /* Fast path */
13644 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13645 return -1;
13646 return 1;
13647 }
13648
13649 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13650 *p_str = v;
13651 Py_INCREF(*p_str);
13652 }
13653 else {
13654 if (arg->ch == 's')
13655 *p_str = PyObject_Str(v);
13656 else if (arg->ch == 'r')
13657 *p_str = PyObject_Repr(v);
13658 else
13659 *p_str = PyObject_ASCII(v);
13660 }
13661 break;
13662
13663 case 'i':
13664 case 'd':
13665 case 'u':
13666 case 'o':
13667 case 'x':
13668 case 'X':
13669 {
13670 int ret = mainformatlong(v, arg, p_str, writer);
13671 if (ret != 0)
13672 return ret;
13673 arg->sign = 1;
13674 break;
13675 }
13676
13677 case 'e':
13678 case 'E':
13679 case 'f':
13680 case 'F':
13681 case 'g':
13682 case 'G':
13683 if (arg->width == -1 && arg->prec == -1
13684 && !(arg->flags & (F_SIGN | F_BLANK)))
13685 {
13686 /* Fast path */
13687 if (formatfloat(v, arg, NULL, writer) == -1)
13688 return -1;
13689 return 1;
13690 }
13691
13692 arg->sign = 1;
13693 if (formatfloat(v, arg, p_str, NULL) == -1)
13694 return -1;
13695 break;
13696
13697 case 'c':
13698 {
13699 Py_UCS4 ch = formatchar(v);
13700 if (ch == (Py_UCS4) -1)
13701 return -1;
13702 if (arg->width == -1 && arg->prec == -1) {
13703 /* Fast path */
13704 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13705 return -1;
13706 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13707 writer->pos += 1;
13708 return 1;
13709 }
13710 *p_str = PyUnicode_FromOrdinal(ch);
13711 break;
13712 }
13713
13714 default:
13715 PyErr_Format(PyExc_ValueError,
13716 "unsupported format character '%c' (0x%x) "
13717 "at index %zd",
13718 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13719 (int)arg->ch,
13720 ctx->fmtpos - 1);
13721 return -1;
13722 }
13723 if (*p_str == NULL)
13724 return -1;
13725 assert (PyUnicode_Check(*p_str));
13726 return 0;
13727}
13728
13729static int
13730unicode_format_arg_output(struct unicode_formatter_t *ctx,
13731 struct unicode_format_arg_t *arg,
13732 PyObject *str)
13733{
13734 Py_ssize_t len;
13735 enum PyUnicode_Kind kind;
13736 void *pbuf;
13737 Py_ssize_t pindex;
13738 Py_UCS4 signchar;
13739 Py_ssize_t buflen;
13740 Py_UCS4 maxchar, bufmaxchar;
13741 Py_ssize_t sublen;
13742 _PyUnicodeWriter *writer = &ctx->writer;
13743 Py_UCS4 fill;
13744
13745 fill = ' ';
13746 if (arg->sign && arg->flags & F_ZERO)
13747 fill = '0';
13748
13749 if (PyUnicode_READY(str) == -1)
13750 return -1;
13751
13752 len = PyUnicode_GET_LENGTH(str);
13753 if ((arg->width == -1 || arg->width <= len)
13754 && (arg->prec == -1 || arg->prec >= len)
13755 && !(arg->flags & (F_SIGN | F_BLANK)))
13756 {
13757 /* Fast path */
13758 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13759 return -1;
13760 return 0;
13761 }
13762
13763 /* Truncate the string for "s", "r" and "a" formats
13764 if the precision is set */
13765 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13766 if (arg->prec >= 0 && len > arg->prec)
13767 len = arg->prec;
13768 }
13769
13770 /* Adjust sign and width */
13771 kind = PyUnicode_KIND(str);
13772 pbuf = PyUnicode_DATA(str);
13773 pindex = 0;
13774 signchar = '\0';
13775 if (arg->sign) {
13776 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13777 if (ch == '-' || ch == '+') {
13778 signchar = ch;
13779 len--;
13780 pindex++;
13781 }
13782 else if (arg->flags & F_SIGN)
13783 signchar = '+';
13784 else if (arg->flags & F_BLANK)
13785 signchar = ' ';
13786 else
13787 arg->sign = 0;
13788 }
13789 if (arg->width < len)
13790 arg->width = len;
13791
13792 /* Prepare the writer */
13793 bufmaxchar = 127;
13794 if (!(arg->flags & F_LJUST)) {
13795 if (arg->sign) {
13796 if ((arg->width-1) > len)
13797 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13798 }
13799 else {
13800 if (arg->width > len)
13801 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13802 }
13803 }
13804 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13805 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13806 buflen = arg->width;
13807 if (arg->sign && len == arg->width)
13808 buflen++;
13809 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13810 return -1;
13811
13812 /* Write the sign if needed */
13813 if (arg->sign) {
13814 if (fill != ' ') {
13815 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13816 writer->pos += 1;
13817 }
13818 if (arg->width > len)
13819 arg->width--;
13820 }
13821
13822 /* Write the numeric prefix for "x", "X" and "o" formats
13823 if the alternate form is used.
13824 For example, write "0x" for the "%#x" format. */
13825 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13826 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13827 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13828 if (fill != ' ') {
13829 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13830 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13831 writer->pos += 2;
13832 pindex += 2;
13833 }
13834 arg->width -= 2;
13835 if (arg->width < 0)
13836 arg->width = 0;
13837 len -= 2;
13838 }
13839
13840 /* Pad left with the fill character if needed */
13841 if (arg->width > len && !(arg->flags & F_LJUST)) {
13842 sublen = arg->width - len;
13843 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13844 writer->pos += sublen;
13845 arg->width = len;
13846 }
13847
13848 /* If padding with spaces: write sign if needed and/or numeric prefix if
13849 the alternate form is used */
13850 if (fill == ' ') {
13851 if (arg->sign) {
13852 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13853 writer->pos += 1;
13854 }
13855 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13856 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13857 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13858 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13859 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13860 writer->pos += 2;
13861 pindex += 2;
13862 }
13863 }
13864
13865 /* Write characters */
13866 if (len) {
13867 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13868 str, pindex, len);
13869 writer->pos += len;
13870 }
13871
13872 /* Pad right with the fill character if needed */
13873 if (arg->width > len) {
13874 sublen = arg->width - len;
13875 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13876 writer->pos += sublen;
13877 }
13878 return 0;
13879}
13880
13881/* Helper of PyUnicode_Format(): format one arg.
13882 Return 0 on success, raise an exception and return -1 on error. */
13883static int
13884unicode_format_arg(struct unicode_formatter_t *ctx)
13885{
13886 struct unicode_format_arg_t arg;
13887 PyObject *str;
13888 int ret;
13889
13890 ret = unicode_format_arg_parse(ctx, &arg);
13891 if (ret == -1)
13892 return -1;
13893
13894 ret = unicode_format_arg_format(ctx, &arg, &str);
13895 if (ret == -1)
13896 return -1;
13897
13898 if (ret != 1) {
13899 ret = unicode_format_arg_output(ctx, &arg, str);
13900 Py_DECREF(str);
13901 if (ret == -1)
13902 return -1;
13903 }
13904
13905 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13906 PyErr_SetString(PyExc_TypeError,
13907 "not all arguments converted during string formatting");
13908 return -1;
13909 }
13910 return 0;
13911}
13912
Alexander Belopolsky40018472011-02-26 01:02:56 +000013913PyObject *
13914PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915{
Victor Stinnera47082312012-10-04 02:19:54 +020013916 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013917
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 PyErr_BadInternalCall();
13920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921 }
Victor Stinnera47082312012-10-04 02:19:54 +020013922
13923 ctx.fmtstr = PyUnicode_FromObject(format);
13924 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013926 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13927 Py_DECREF(ctx.fmtstr);
13928 return NULL;
13929 }
13930 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13931 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13932 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13933 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013934
Victor Stinnera47082312012-10-04 02:19:54 +020013935 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013936
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013938 ctx.arglen = PyTuple_Size(args);
13939 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940 }
13941 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013942 ctx.arglen = -1;
13943 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944 }
Victor Stinnera47082312012-10-04 02:19:54 +020013945 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013946 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013947 ctx.dict = args;
13948 else
13949 ctx.dict = NULL;
13950 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951
Victor Stinnera47082312012-10-04 02:19:54 +020013952 while (--ctx.fmtcnt >= 0) {
13953 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13954 Py_ssize_t nonfmtpos, sublen;
13955 Py_UCS4 maxchar;
13956
13957 nonfmtpos = ctx.fmtpos++;
13958 while (ctx.fmtcnt >= 0 &&
13959 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13960 ctx.fmtpos++;
13961 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 }
Victor Stinnera47082312012-10-04 02:19:54 +020013963 if (ctx.fmtcnt < 0) {
13964 ctx.fmtpos--;
13965 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013966 }
Victor Stinnera47082312012-10-04 02:19:54 +020013967 sublen = ctx.fmtpos - nonfmtpos;
13968 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013969 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013970 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013971 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013972
Victor Stinnera47082312012-10-04 02:19:54 +020013973 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13974 ctx.fmtstr, nonfmtpos, sublen);
13975 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 }
13977 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013978 ctx.fmtpos++;
13979 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013980 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013981 }
13982 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013983
Victor Stinnera47082312012-10-04 02:19:54 +020013984 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013985 PyErr_SetString(PyExc_TypeError,
13986 "not all arguments converted during string formatting");
13987 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013988 }
13989
Victor Stinnera47082312012-10-04 02:19:54 +020013990 if (ctx.args_owned) {
13991 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992 }
Victor Stinnera47082312012-10-04 02:19:54 +020013993 Py_DECREF(ctx.fmtstr);
13994 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013995
Benjamin Peterson29060642009-01-31 22:14:21 +000013996 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020013997 Py_DECREF(ctx.fmtstr);
13998 _PyUnicodeWriter_Dealloc(&ctx.writer);
13999 if (ctx.args_owned) {
14000 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014001 }
14002 return NULL;
14003}
14004
Jeremy Hylton938ace62002-07-17 16:30:39 +000014005static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014006unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14007
Tim Peters6d6c1a32001-08-02 04:15:00 +000014008static PyObject *
14009unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14010{
Benjamin Peterson29060642009-01-31 22:14:21 +000014011 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 static char *kwlist[] = {"object", "encoding", "errors", 0};
14013 char *encoding = NULL;
14014 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014015
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 if (type != &PyUnicode_Type)
14017 return unicode_subtype_new(type, args, kwds);
14018 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014019 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014021 if (x == NULL) {
14022 Py_INCREF(unicode_empty);
14023 return unicode_empty;
14024 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 if (encoding == NULL && errors == NULL)
14026 return PyObject_Str(x);
14027 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014028 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014029}
14030
/* Create an instance of a str subclass.

   type must be a subtype of PyUnicode_Type (asserted).  args and kwds are
   forwarded unchanged to unicode_new() to build a temporary exact str; its
   characters are then copied into a separately allocated buffer owned by a
   new, non-compact object of the requested type.

   Returns the new object, or NULL on failure.  The temporary str is
   released on every path. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *unicode, *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    /* Let the base-type constructor do the argument parsing/conversion. */
    unicode = unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        Py_DECREF(unicode);
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    /* Initialise every header field before anything can fail, so that the
       onError path can safely Py_DECREF(self). */
    _PyUnicode_LENGTH(self) = length;
#ifdef Py_DEBUG
    /* In debug builds the hash is copied only after the consistency check
       at the end of this function. */
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Decide whether the single data buffer can double as the utf8 and/or
       wstr representation: utf8 when all characters are below 128, wstr
       when the character width equals sizeof(wchar_t). */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    /* One extra character of room: the copy below transfers length + 1
       characters from the source buffer. */
    data = PyObject_MALLOC((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* NOTE(review): the byte count here is computed from kind while the
       allocation above used char_size; this relies on the kind constants
       being numerically 1, 2 and 4 -- confirm against the header. */
    Py_MEMCPY(data, PyUnicode_DATA(unicode),
              kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    Py_DECREF(unicode);
    return self;

onError:
    Py_DECREF(unicode);
    Py_DECREF(self);
    return NULL;
}
14130
/* Docstring of the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(object[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014141
/* Defined further down, after the iterator type; needed here for the
   tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  The initialiser is positional: each entry is
   labelled with the slot it fills.  The type is subclassable
   (Py_TPFLAGS_BASETYPE) and instances are created through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14187
14188/* Initialize the Unicode implementation */
14189
Victor Stinner3a50e702011-10-18 21:21:00 +020014190int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014191{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014192 int i;
14193
Thomas Wouters477c8d52006-05-27 19:21:47 +000014194 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014195 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014196 0x000A, /* LINE FEED */
14197 0x000D, /* CARRIAGE RETURN */
14198 0x001C, /* FILE SEPARATOR */
14199 0x001D, /* GROUP SEPARATOR */
14200 0x001E, /* RECORD SEPARATOR */
14201 0x0085, /* NEXT LINE */
14202 0x2028, /* LINE SEPARATOR */
14203 0x2029, /* PARAGRAPH SEPARATOR */
14204 };
14205
Fred Drakee4315f52000-05-09 19:53:39 +000014206 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014207 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014208 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014209 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014210 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014211
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014212 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014213 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014214 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014215 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014216
14217 /* initialize the linebreak bloom filter */
14218 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014219 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014220 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014221
14222 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014223
14224#ifdef HAVE_MBCS
14225 winver.dwOSVersionInfoSize = sizeof(winver);
14226 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14227 PyErr_SetFromWindowsErr(0);
14228 return -1;
14229 }
14230#endif
14231 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014232}
14233
14234/* Finalize the Unicode implementation */
14235
/* Report how many cached objects were released.  This implementation does
   no work and always reports zero. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14241
Guido van Rossumd57fd912000-03-10 22:53:23 +000014242void
Thomas Wouters78890102000-07-22 19:25:51 +000014243_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014244{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014245 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014246
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014247 Py_XDECREF(unicode_empty);
14248 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014249
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014250 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014251 if (unicode_latin1[i]) {
14252 Py_DECREF(unicode_latin1[i]);
14253 unicode_latin1[i] = NULL;
14254 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014255 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014256 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014257 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014258}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014259
/* Intern the string referenced by *p.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by a new reference to that stored string and the reference
   previously held in *p is dropped.  Otherwise the string itself is added
   to the dict and marked SSTATE_INTERNED_MORTAL.

   The function returns without doing anything for NULL / non-str input
   (release builds; debug builds assert instead), for instances of str
   subclasses, and for strings that are already interned.  Errors from the
   dict operations are cleared, leaving *p unchanged. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* Already known: hand the caller the stored object instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* recursion_critical is raised only around the insertion and reset on
       both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
14311
14312void
14313PyUnicode_InternImmortal(PyObject **p)
14314{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014315 PyUnicode_InternInPlace(p);
14316 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014317 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014318 Py_INCREF(*p);
14319 }
Walter Dörwald16807132007-05-25 13:52:07 +000014320}
14321
14322PyObject *
14323PyUnicode_InternFromString(const char *cp)
14324{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014325 PyObject *s = PyUnicode_FromString(cp);
14326 if (s == NULL)
14327 return NULL;
14328 PyUnicode_InternInPlace(&s);
14329 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014330}
14331
/* Undo interning for every string stored in the `interned` dict, print
   statistics to stderr, then clear and release the dict.

   Does nothing when the dict does not exist.  For each key the reference
   count adjustment made at interning time is reverted (+1 for immortal,
   +2 for mortal strings) and the string is marked SSTATE_NOT_INTERNED. */
void
_Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* NOTE(review): a non-NULL, non-list `keys` would not be released
           on this path -- confirm PyDict_Keys() can only return a list. */
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            /* Asserts in debug builds; otherwise only reports and then
               carries on with the same string. */
            assert(0 && "could not ready string");
            fprintf(stderr, "could not ready string\n");
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
    /* The sizes printed are sums of string lengths in characters. */
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_DECREF(interned);
    interned = NULL;
}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014387
14388
14389/********************* Unicode Iterator **************************/
14390
/* State of a str iterator: the string being walked and the index of the
   next character to yield. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14396
/* Destructor for str iterators: untracks the object from the GC, drops the
   reference to the underlying string (which may already be NULL) and frees
   the iterator itself. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
14404
/* GC traversal hook: the only object reference an iterator holds is
   it_seq.  Returns 0 once it has been visited. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
14411
14412static PyObject *
14413unicodeiter_next(unicodeiterobject *it)
14414{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014415 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014416
Benjamin Peterson14339b62009-01-31 16:36:08 +000014417 assert(it != NULL);
14418 seq = it->it_seq;
14419 if (seq == NULL)
14420 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014421 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014423 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14424 int kind = PyUnicode_KIND(seq);
14425 void *data = PyUnicode_DATA(seq);
14426 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14427 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014428 if (item != NULL)
14429 ++it->it_index;
14430 return item;
14431 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014432
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 Py_DECREF(seq);
14434 it->it_seq = NULL;
14435 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014436}
14437
14438static PyObject *
14439unicodeiter_len(unicodeiterobject *it)
14440{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014441 Py_ssize_t len = 0;
14442 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014443 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014444 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014445}
14446
14447PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14448
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014449static PyObject *
14450unicodeiter_reduce(unicodeiterobject *it)
14451{
14452 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014453 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014454 it->it_seq, it->it_index);
14455 } else {
14456 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14457 if (u == NULL)
14458 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014459 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014460 }
14461}
14462
14463PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14464
14465static PyObject *
14466unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14467{
14468 Py_ssize_t index = PyLong_AsSsize_t(state);
14469 if (index == -1 && PyErr_Occurred())
14470 return NULL;
14471 if (index < 0)
14472 index = 0;
14473 it->it_index = index;
14474 Py_RETURN_NONE;
14475}
14476
14477PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14478
/* Method table of the str iterator type: length hint plus the
   __reduce__/__setstate__ pair.  Terminated by a NULL entry. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
14488
/* Type object for str iterators.  The initialiser is positional; instances
   take part in garbage collection (Py_TPFLAGS_HAVE_GC) and iterate over
   themselves (PyObject_SelfIter). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                         /* tp_name */
    sizeof(unicodeiterobject),              /* tp_basicsize */
    0,                                      /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,        /* tp_dealloc */
    0,                                      /* tp_print */
    0,                                      /* tp_getattr */
    0,                                      /* tp_setattr */
    0,                                      /* tp_reserved */
    0,                                      /* tp_repr */
    0,                                      /* tp_as_number */
    0,                                      /* tp_as_sequence */
    0,                                      /* tp_as_mapping */
    0,                                      /* tp_hash */
    0,                                      /* tp_call */
    0,                                      /* tp_str */
    PyObject_GenericGetAttr,                /* tp_getattro */
    0,                                      /* tp_setattro */
    0,                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                      /* tp_doc */
    (traverseproc)unicodeiter_traverse,     /* tp_traverse */
    0,                                      /* tp_clear */
    0,                                      /* tp_richcompare */
    0,                                      /* tp_weaklistoffset */
    PyObject_SelfIter,                      /* tp_iter */
    (iternextfunc)unicodeiter_next,         /* tp_iternext */
    unicodeiter_methods,                    /* tp_methods */
    0,                                      /* tp_members */
};
14521
14522static PyObject *
14523unicode_iter(PyObject *seq)
14524{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014525 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014526
Benjamin Peterson14339b62009-01-31 16:36:08 +000014527 if (!PyUnicode_Check(seq)) {
14528 PyErr_BadInternalCall();
14529 return NULL;
14530 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014531 if (PyUnicode_READY(seq) == -1)
14532 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014533 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14534 if (it == NULL)
14535 return NULL;
14536 it->it_index = 0;
14537 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014538 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 _PyObject_GC_TRACK(it);
14540 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014541}
14542
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014543
14544size_t
14545Py_UNICODE_strlen(const Py_UNICODE *u)
14546{
14547 int res = 0;
14548 while(*u++)
14549 res++;
14550 return res;
14551}
14552
14553Py_UNICODE*
14554Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14555{
14556 Py_UNICODE *u = s1;
14557 while ((*u++ = *s2++));
14558 return s1;
14559}
14560
14561Py_UNICODE*
14562Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14563{
14564 Py_UNICODE *u = s1;
14565 while ((*u++ = *s2++))
14566 if (n-- == 0)
14567 break;
14568 return s1;
14569}
14570
14571Py_UNICODE*
14572Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14573{
14574 Py_UNICODE *u1 = s1;
14575 u1 += Py_UNICODE_strlen(u1);
14576 Py_UNICODE_strcpy(u1, s2);
14577 return s1;
14578}
14579
14580int
14581Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14582{
14583 while (*s1 && *s2 && *s1 == *s2)
14584 s1++, s2++;
14585 if (*s1 && *s2)
14586 return (*s1 < *s2) ? -1 : +1;
14587 if (*s1)
14588 return 1;
14589 if (*s2)
14590 return -1;
14591 return 0;
14592}
14593
14594int
14595Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14596{
14597 register Py_UNICODE u1, u2;
14598 for (; n != 0; n--) {
14599 u1 = *s1;
14600 u2 = *s2;
14601 if (u1 != u2)
14602 return (u1 < u2) ? -1 : +1;
14603 if (u1 == '\0')
14604 return 0;
14605 s1++;
14606 s2++;
14607 }
14608 return 0;
14609}
14610
14611Py_UNICODE*
14612Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14613{
14614 const Py_UNICODE *p;
14615 for (p = s; *p; p++)
14616 if (*p == c)
14617 return (Py_UNICODE*)p;
14618 return NULL;
14619}
14620
14621Py_UNICODE*
14622Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14623{
14624 const Py_UNICODE *p;
14625 p = s + Py_UNICODE_strlen(s);
14626 while (p != s) {
14627 p--;
14628 if (*p == c)
14629 return (Py_UNICODE*)p;
14630 }
14631 return NULL;
14632}
Victor Stinner331ea922010-08-10 16:37:20 +000014633
Victor Stinner71133ff2010-09-01 23:43:53 +000014634Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014635PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014636{
Victor Stinner577db2c2011-10-11 22:12:48 +020014637 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014638 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014640 if (!PyUnicode_Check(unicode)) {
14641 PyErr_BadArgument();
14642 return NULL;
14643 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014644 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014645 if (u == NULL)
14646 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014647 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014648 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014649 PyErr_NoMemory();
14650 return NULL;
14651 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014652 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014653 size *= sizeof(Py_UNICODE);
14654 copy = PyMem_Malloc(size);
14655 if (copy == NULL) {
14656 PyErr_NoMemory();
14657 return NULL;
14658 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014659 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014660 return copy;
14661}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014662
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exported by the _string module; each takes exactly one
   argument (METH_O).  Terminated by a NULL entry. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}
};
14673
/* Module definition for _string.  The initialiser is positional; the
   labels below follow the documented PyModuleDef field order. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                              /* m_name */
    PyDoc_STR("string helper module"),      /* m_doc */
    0,                                      /* m_size */
    _string_methods,                        /* m_methods */
    NULL,                                   /* m_reload */
    NULL,                                   /* m_traverse */
    NULL,                                   /* m_clear */
    NULL                                    /* m_free */
};
14685
14686PyMODINIT_FUNC
14687PyInit__string(void)
14688{
14689 return PyModule_Create(&_string_module);
14690}
14691
14692
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014693#ifdef __cplusplus
14694}
14695#endif