blob: b4c7ecf9002daac50b6a33f926d24a922ecf3ba1 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
/* Sanity check applied by the accessor macros below: the structural
   consistency check (without content check) in debug builds, a plain type
   check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 field; no checks, usable as an lvalue. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer: for a compact ASCII object this is
   the memory directly following the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; no checks, usable as an lvalue. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII object this is the object's
   length field, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors (no checks, usable as lvalues).  Note which struct
   each field is read through: wstr, length, state and hash come from
   PyASCIIObject, wstr_length from PyCompactUnicodeObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Checked variants: assert _PyUnicode_CHECK(op) before reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data pointer of a PyUnicodeObject; no checks. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   NOTE(review): a bitwise OR only bounds both operands from above; it
   equals the larger one when both are of the form 2**k - 1 (e.g. 0x7f,
   0xff, 0xffff) -- confirm that callers only rely on the upper bound. */
#define MAX_MAXCHAR(maxchar1, maxchar2) \
    ((maxchar1) | (maxchar2))

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. the UTF-8 form is not stored in a separate block.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data.  Every use below refers to the macro argument `op`; the expansion
   must not depend on what the caller's variable happens to be called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the object is
   not ready yet or wstr differs from the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast.  The bulk of the
   input is copied four elements per iteration; the remaining 0-3 elements
   are copied one at a time.  All three pointer arguments are evaluated
   exactly once, and `to` is parenthesized before the cast so that an
   expression argument such as `buf + offset` is cast as a whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
   Another way to look at this is to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the ASCII code point c is whitespace, 0 otherwise.  The table has
   one entry per ASCII code point (128 entries, 16 per row below). */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Fast detection of line break characters in the ASCII range: entry c is 1
   when the ASCII code point c is a line break, 0 otherwise.  The table is
   read-only, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-only validator for the internal layout of a Unicode object.

   Every invariant is expressed as an assert(); the function itself always
   returns 1 so that it can be wrapped in assert() by callers.  When
   check_content is non-zero (and the string is not a wchar_t-only string),
   the characters are scanned as well to verify that the narrowest possible
   kind is used and that the data is followed by a null character. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *hdr;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    hdr = (PyASCIIObject *)op;
    kind = hdr->state.kind;

    if (hdr->state.ascii != 1 || hdr->state.compact != 1) {
        /* anything but a compact ASCII string */
        PyCompactUnicodeObject *cmp = (PyCompactUnicodeObject *)op;
        void *payload;
        /* the kind whose character width matches wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (hdr->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            payload = cmp + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 1);
            assert(cmp->utf8 != payload);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* non-compact string that only has a wchar_t representation */
            payload = ((PyUnicodeObject *)op)->data.any;
            assert(hdr->length == 0);
            assert(hdr->hash == -1);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ascii == 0);
            assert(hdr->state.ready == 0);
            assert(hdr->state.interned == SSTATE_NOT_INTERNED);
            assert(hdr->wstr != NULL);
            assert(payload == NULL);
            assert(cmp->utf8 == NULL);
        }
        else {
            /* non-compact string that has been made ready */
            payload = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(hdr->state.compact == 0);
            assert(hdr->state.ready == 1);
            assert(payload != NULL);
            if (hdr->state.ascii) {
                assert(cmp->utf8 == payload);
                assert(cmp->utf8_length == hdr->length);
            }
            else {
                assert(cmp->utf8 != payload);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the widths match */
            if (kind == wchar_kind) {
                assert(hdr->wstr == payload);
                assert(cmp->wstr_length == hdr->length);
            }
            else {
                assert(hdr->wstr != payload);
            }
        }

        /* an absent buffer must have a zero length */
        if (cmp->utf8 == NULL) {
            assert(cmp->utf8_length == 0);
        }
        if (hdr->wstr == NULL) {
            assert(cmp->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(hdr->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 top = 0;
        void *chars = PyUnicode_DATA(hdr);

        for (pos = 0; pos < hdr->length; pos++) {
            Py_UCS4 c = PyUnicode_READ(kind, chars, pos);
            if (c > top)
                top = c;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (hdr->state.ascii == 0) {
                assert(top >= 128);
                assert(top <= 255);
            }
            else {
                assert(top < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(top >= 0x100);
            assert(top <= 0xFFFF);
            break;
        default:
            assert(top >= 0x10000);
            assert(top <= MAX_UNICODE);
            break;
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, chars, hdr->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415 len = _PyUnicode_WSTR_LENGTH(unicode);
416 if (len == 0) {
417 Py_INCREF(unicode_empty);
418 Py_DECREF(unicode);
419 return unicode_empty;
420 }
421
422 if (len == 1) {
423 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
424 if (ch < 256) {
425 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
426 Py_DECREF(unicode);
427 return latin1_char;
428 }
429 }
430
431 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200432 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 return NULL;
434 }
435#else
Victor Stinneraa771272012-10-04 02:32:58 +0200436 assert(Py_REFCNT(unicode) == 1);
437
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
Victor Stinner3a50e702011-10-18 21:21:00 +0200507#ifdef HAVE_MBCS
508static OSVERSIONINFOEX winver;
509#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  Platforms with LONG_BIT below 32 are rejected
   at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type that holds one bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for the line break characters; it has no initializer here and
   is consulted by BLOOM_LINEBREAK(). */
static BLOOM_MASK bloom_linebreak;
532
/* BLOOM_ADD sets, and BLOOM tests, the bit selected by the low-order bits
   of `ch` (ch & (BLOOM_WIDTH - 1)) in `mask`.  `mask` must be an lvalue for
   BLOOM_ADD; for BLOOM it may be any expression, which is why it is
   parenthesized in the expansion. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak; for the others the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask does not rule the
   character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
/* True if `chr` is found in the string object `str`.  The bloom test on
   `mask` is evaluated first; PyUnicode_FindChar() is only called when the
   mask does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte being set to 0xff; nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t grown = _PyUnicode_LENGTH(unicode) - old_length;

    if (grown > 0)
        memset(bytes + old_length * char_size, 0xff, grown * char_size);
}
#endif
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200670#ifdef Py_DEBUG
671 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
672#endif
673
Victor Stinner79891572012-05-03 13:43:07 +0200674 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100676 assert(PyUnicode_IS_COMPACT(unicode));
677
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200678 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 struct_size = sizeof(PyASCIIObject);
681 else
682 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200683 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
686 PyErr_NoMemory();
687 return NULL;
688 }
689 new_size = (struct_size + (length + 1) * char_size);
690
Victor Stinner84def372011-12-11 20:04:56 +0100691 _Py_DEC_REFTOTAL;
692 _Py_ForgetReference(unicode);
693
694 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
695 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100696 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 PyErr_NoMemory();
698 return NULL;
699 }
Victor Stinner84def372011-12-11 20:04:56 +0100700 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100702
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200704 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100706 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 _PyUnicode_WSTR_LENGTH(unicode) = length;
708 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200709#ifdef Py_DEBUG
710 unicode_fill_invalid(unicode, old_length);
711#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
713 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200714 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 return unicode;
716}
717
Alexander Belopolsky40018472011-02-26 01:02:56 +0000718static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200719resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720{
Victor Stinner95663112011-10-04 01:03:50 +0200721 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000725
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 if (PyUnicode_IS_READY(unicode)) {
727 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200730#ifdef Py_DEBUG
731 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
732#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733
734 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200735 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200736 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
737 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738
739 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
740 PyErr_NoMemory();
741 return -1;
742 }
743 new_size = (length + 1) * char_size;
744
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
746 {
747 PyObject_DEL(_PyUnicode_UTF8(unicode));
748 _PyUnicode_UTF8(unicode) = NULL;
749 _PyUnicode_UTF8_LENGTH(unicode) = 0;
750 }
751
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 data = (PyObject *)PyObject_REALLOC(data, new_size);
753 if (data == NULL) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200760 _PyUnicode_WSTR_LENGTH(unicode) = length;
761 }
762 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200764 _PyUnicode_UTF8_LENGTH(unicode) = length;
765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 _PyUnicode_LENGTH(unicode) = length;
767 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 unicode_fill_invalid(unicode, old_length);
770#endif
Victor Stinner95663112011-10-04 01:03:50 +0200771 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200772 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
Victor Stinner95663112011-10-04 01:03:50 +0200776 assert(_PyUnicode_WSTR(unicode) != NULL);
777
778 /* check for integer overflow */
779 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
780 PyErr_NoMemory();
781 return -1;
782 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200784 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200786 if (!wstr) {
787 PyErr_NoMemory();
788 return -1;
789 }
790 _PyUnicode_WSTR(unicode) = wstr;
791 _PyUnicode_WSTR(unicode)[length] = 0;
792 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200793 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 return 0;
795}
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797static PyObject*
798resize_copy(PyObject *unicode, Py_ssize_t length)
799{
800 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100801 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200802 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Benjamin Petersonbac79492012-01-14 13:34:47 -0500804 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100805 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806
807 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
808 if (copy == NULL)
809 return NULL;
810
811 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200812 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200814 }
815 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100817
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200818 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 if (w == NULL)
820 return NULL;
821 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
822 copy_length = Py_MIN(copy_length, length);
823 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
824 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200825 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200826 }
827}
828
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000830 Ux0000 terminated; some code (e.g. new_identifier)
831 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832
833 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000834 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
836*/
837
Alexander Belopolsky40018472011-02-26 01:02:56 +0000838static PyUnicodeObject *
839_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840{
841 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843
Thomas Wouters477c8d52006-05-27 19:21:47 +0000844 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 if (length == 0 && unicode_empty != NULL) {
846 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200847 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 }
849
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000850 /* Ensure we won't overflow the size. */
851 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
852 return (PyUnicodeObject *)PyErr_NoMemory();
853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 if (length < 0) {
855 PyErr_SetString(PyExc_SystemError,
856 "Negative size passed to _PyUnicode_New");
857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000858 }
859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
861 if (unicode == NULL)
862 return NULL;
863 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
864 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
865 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100866 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000867 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100868 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870
Jeremy Hyltond8082792003-09-16 19:41:39 +0000871 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000872 * the caller fails before initializing str -- unicode_resize()
873 * reads str[0], and the Keep-Alive optimization can keep memory
874 * allocated for str alive across a call to unicode_dealloc(unicode).
875 * We don't want unicode_resize to read uninitialized memory in
876 * that case.
877 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200878 _PyUnicode_WSTR(unicode)[0] = 0;
879 _PyUnicode_WSTR(unicode)[length] = 0;
880 _PyUnicode_WSTR_LENGTH(unicode) = length;
881 _PyUnicode_HASH(unicode) = -1;
882 _PyUnicode_STATE(unicode).interned = 0;
883 _PyUnicode_STATE(unicode).kind = 0;
884 _PyUnicode_STATE(unicode).compact = 0;
885 _PyUnicode_STATE(unicode).ready = 0;
886 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200887 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200889 _PyUnicode_UTF8(unicode) = NULL;
890 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100891 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 return unicode;
893}
894
Victor Stinnerf42dc442011-10-02 23:33:16 +0200895static const char*
896unicode_kind_name(PyObject *unicode)
897{
Victor Stinner42dfd712011-10-03 14:41:45 +0200898 /* don't check consistency: unicode_kind_name() is called from
899 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 if (!PyUnicode_IS_COMPACT(unicode))
901 {
902 if (!PyUnicode_IS_READY(unicode))
903 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600904 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 {
906 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 return "legacy ascii";
909 else
910 return "legacy latin1";
911 case PyUnicode_2BYTE_KIND:
912 return "legacy UCS2";
913 case PyUnicode_4BYTE_KIND:
914 return "legacy UCS4";
915 default:
916 return "<legacy invalid kind>";
917 }
918 }
919 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600920 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200922 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200923 return "ascii";
924 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200925 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200926 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200927 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200928 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200929 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930 default:
931 return "<invalid compact kind>";
932 }
933}
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
940
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
944void *_PyUnicode_data(void *unicode){
945 printf("obj %p\n", unicode);
946 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
947 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
948 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
949 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
950 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
951 return PyUnicode_DATA(unicode);
952}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200953
954void
955_PyUnicode_Dump(PyObject *op)
956{
957 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200958 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
959 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
960 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200961
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 {
964 if (ascii->state.ascii)
965 data = (ascii + 1);
966 else
967 data = (compact + 1);
968 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 else
970 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200971 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
972
Victor Stinnera849a4b2011-10-03 12:12:11 +0200973 if (ascii->wstr == data)
974 printf("shared ");
975 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200976
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200978 printf(" (%zu), ", compact->wstr_length);
979 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
980 printf("shared ");
981 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200982 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200983 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200984}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985#endif
986
987PyObject *
988PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
989{
990 PyObject *obj;
991 PyCompactUnicodeObject *unicode;
992 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200993 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200994 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 Py_ssize_t char_size;
996 Py_ssize_t struct_size;
997
998 /* Optimization for empty strings */
999 if (size == 0 && unicode_empty != NULL) {
1000 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001001 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
1003
Victor Stinner9e9d6892011-10-04 01:02:02 +02001004 is_ascii = 0;
1005 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 struct_size = sizeof(PyCompactUnicodeObject);
1007 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001008 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 char_size = 1;
1010 is_ascii = 1;
1011 struct_size = sizeof(PyASCIIObject);
1012 }
1013 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001014 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001015 char_size = 1;
1016 }
1017 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001018 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019 char_size = 2;
1020 if (sizeof(wchar_t) == 2)
1021 is_sharing = 1;
1022 }
1023 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001024 if (maxchar > MAX_UNICODE) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "invalid maximum character passed to PyUnicode_New");
1027 return NULL;
1028 }
Victor Stinner8f825062012-04-27 13:55:39 +02001029 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 char_size = 4;
1031 if (sizeof(wchar_t) == 4)
1032 is_sharing = 1;
1033 }
1034
1035 /* Ensure we won't overflow the size. */
1036 if (size < 0) {
1037 PyErr_SetString(PyExc_SystemError,
1038 "Negative size passed to PyUnicode_New");
1039 return NULL;
1040 }
1041 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1042 return PyErr_NoMemory();
1043
1044 /* Duplicated allocation code from _PyObject_New() instead of a call to
1045 * PyObject_New() so we are able to allocate space for the object and
1046 * it's data buffer.
1047 */
1048 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1049 if (obj == NULL)
1050 return PyErr_NoMemory();
1051 obj = PyObject_INIT(obj, &PyUnicode_Type);
1052 if (obj == NULL)
1053 return NULL;
1054
1055 unicode = (PyCompactUnicodeObject *)obj;
1056 if (is_ascii)
1057 data = ((PyASCIIObject*)obj) + 1;
1058 else
1059 data = unicode + 1;
1060 _PyUnicode_LENGTH(unicode) = size;
1061 _PyUnicode_HASH(unicode) = -1;
1062 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 _PyUnicode_STATE(unicode).compact = 1;
1065 _PyUnicode_STATE(unicode).ready = 1;
1066 _PyUnicode_STATE(unicode).ascii = is_ascii;
1067 if (is_ascii) {
1068 ((char*)data)[size] = 0;
1069 _PyUnicode_WSTR(unicode) = NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((char*)data)[size] = 0;
1073 _PyUnicode_WSTR(unicode) = NULL;
1074 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001076 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 else {
1079 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001080 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001081 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001083 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 ((Py_UCS4*)data)[size] = 0;
1085 if (is_sharing) {
1086 _PyUnicode_WSTR_LENGTH(unicode) = size;
1087 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1088 }
1089 else {
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1091 _PyUnicode_WSTR(unicode) = NULL;
1092 }
1093 }
Victor Stinner8f825062012-04-27 13:55:39 +02001094#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001095 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001096#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001097 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 return obj;
1099}
1100
1101#if SIZEOF_WCHAR_T == 2
1102/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1103 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001104 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105
1106 This function assumes that unicode can hold one more code point than wstr
1107 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001108static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001110 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111{
1112 const wchar_t *iter;
1113 Py_UCS4 *ucs4_out;
1114
Victor Stinner910337b2011-10-03 03:20:16 +02001115 assert(unicode != NULL);
1116 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1118 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1119
1120 for (iter = begin; iter < end; ) {
1121 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1122 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001123 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1124 && (iter+1) < end
1125 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 {
Victor Stinner551ac952011-11-29 22:58:13 +01001127 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 iter += 2;
1129 }
1130 else {
1131 *ucs4_out++ = *iter;
1132 iter++;
1133 }
1134 }
1135 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1136 _PyUnicode_GET_LENGTH(unicode)));
1137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138}
1139#endif
1140
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141static int
Victor Stinner488fa492011-12-12 00:01:39 +01001142unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001143{
Victor Stinner488fa492011-12-12 00:01:39 +01001144 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001145 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001146 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001147 return -1;
1148 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001149 return 0;
1150}
1151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152static int
1153_copy_characters(PyObject *to, Py_ssize_t to_start,
1154 PyObject *from, Py_ssize_t from_start,
1155 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001157 unsigned int from_kind, to_kind;
1158 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(0 <= how_many);
1161 assert(0 <= from_start);
1162 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001164 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001165 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166
Victor Stinnerd3f08822012-05-29 12:57:52 +02001167 assert(PyUnicode_Check(to));
1168 assert(PyUnicode_IS_READY(to));
1169 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1170
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001171 if (how_many == 0)
1172 return 0;
1173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001177 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178
Victor Stinnerf1852262012-06-16 16:38:26 +02001179#ifdef Py_DEBUG
1180 if (!check_maxchar
1181 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1182 {
1183 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1184 Py_UCS4 ch;
1185 Py_ssize_t i;
1186 for (i=0; i < how_many; i++) {
1187 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1188 assert(ch <= to_maxchar);
1189 }
1190 }
1191#endif
1192
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001193 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 if (check_maxchar
1195 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1196 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001197 /* Writing Latin-1 characters into an ASCII string requires to
1198 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001199 Py_UCS4 max_char;
1200 max_char = ucs1lib_find_max_char(from_data,
1201 (Py_UCS1*)from_data + how_many);
1202 if (max_char >= 128)
1203 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001204 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001205 Py_MEMCPY((char*)to_data + to_kind * to_start,
1206 (char*)from_data + from_kind * from_start,
1207 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
1210 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS2,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_2BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001219 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS1, Py_UCS4,
1224 PyUnicode_1BYTE_DATA(from) + from_start,
1225 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
1229 else if (from_kind == PyUnicode_2BYTE_KIND
1230 && to_kind == PyUnicode_4BYTE_KIND)
1231 {
1232 _PyUnicode_CONVERT_BYTES(
1233 Py_UCS2, Py_UCS4,
1234 PyUnicode_2BYTE_DATA(from) + from_start,
1235 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1236 PyUnicode_4BYTE_DATA(to) + to_start
1237 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001238 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001239 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1241
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 if (!check_maxchar) {
1243 if (from_kind == PyUnicode_2BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS2, Py_UCS1,
1248 PyUnicode_2BYTE_DATA(from) + from_start,
1249 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_1BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS1,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_1BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else if (from_kind == PyUnicode_4BYTE_KIND
1264 && to_kind == PyUnicode_2BYTE_KIND)
1265 {
1266 _PyUnicode_CONVERT_BYTES(
1267 Py_UCS4, Py_UCS2,
1268 PyUnicode_4BYTE_DATA(from) + from_start,
1269 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1270 PyUnicode_2BYTE_DATA(to) + to_start
1271 );
1272 }
1273 else {
1274 assert(0);
1275 return -1;
1276 }
1277 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001278 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001280 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 Py_ssize_t i;
1282
Victor Stinnera0702ab2011-09-29 14:14:38 +02001283 for (i=0; i < how_many; i++) {
1284 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (ch > to_maxchar)
1286 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001287 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 }
1290 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return 0;
1292}
1293
Victor Stinnerd3f08822012-05-29 12:57:52 +02001294void
1295_PyUnicode_FastCopyCharacters(
1296 PyObject *to, Py_ssize_t to_start,
1297 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001298{
1299 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1300}
1301
1302Py_ssize_t
1303PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1304 PyObject *from, Py_ssize_t from_start,
1305 Py_ssize_t how_many)
1306{
1307 int err;
1308
1309 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1310 PyErr_BadInternalCall();
1311 return -1;
1312 }
1313
Benjamin Petersonbac79492012-01-14 13:34:47 -05001314 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001316 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 return -1;
1318
Victor Stinnerd3f08822012-05-29 12:57:52 +02001319 if (from_start < 0) {
1320 PyErr_SetString(PyExc_IndexError, "string index out of range");
1321 return -1;
1322 }
1323 if (to_start < 0) {
1324 PyErr_SetString(PyExc_IndexError, "string index out of range");
1325 return -1;
1326 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001327 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1328 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1329 PyErr_Format(PyExc_SystemError,
1330 "Cannot write %zi characters at %zi "
1331 "in a string of %zi characters",
1332 how_many, to_start, PyUnicode_GET_LENGTH(to));
1333 return -1;
1334 }
1335
1336 if (how_many == 0)
1337 return 0;
1338
Victor Stinner488fa492011-12-12 00:01:39 +01001339 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340 return -1;
1341
1342 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1343 if (err) {
1344 PyErr_Format(PyExc_SystemError,
1345 "Cannot copy %s characters "
1346 "into a string of %s characters",
1347 unicode_kind_name(from),
1348 unicode_kind_name(to));
1349 return -1;
1350 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001351 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352}
1353
Victor Stinner17222162011-09-28 22:15:37 +02001354/* Find the maximum code point and count the number of surrogate pairs so a
1355 correct string length can be computed before converting a string to UCS4.
1356 This function counts single surrogates as a character and not as a pair.
1357
1358 Return 0 on success, or -1 on error. */
1359static int
1360find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1361 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362{
1363 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001364 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365
Victor Stinnerc53be962011-10-02 21:33:54 +02001366 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 *num_surrogates = 0;
1368 *maxchar = 0;
1369
1370 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001372 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1373 && (iter+1) < end
1374 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001376 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 iter += 2;
1379 }
1380 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001382 {
1383 ch = *iter;
1384 iter++;
1385 }
1386 if (ch > *maxchar) {
1387 *maxchar = ch;
1388 if (*maxchar > MAX_UNICODE) {
1389 PyErr_Format(PyExc_ValueError,
1390 "character U+%x is not in range [U+0000; U+10ffff]",
1391 ch);
1392 return -1;
1393 }
1394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 return 0;
1397}
1398
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001399int
1400_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401{
1402 wchar_t *end;
1403 Py_UCS4 maxchar = 0;
1404 Py_ssize_t num_surrogates;
1405#if SIZEOF_WCHAR_T == 2
1406 Py_ssize_t length_wo_surrogates;
1407#endif
1408
Georg Brandl7597add2011-10-05 16:36:47 +02001409 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 strings were created using _PyObject_New() and where no canonical
1411 representation (the str field) has been set yet aka strings
1412 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001413 assert(_PyUnicode_CHECK(unicode));
1414 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001416 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001418 /* Actually, it should neither be interned nor be anything else: */
1419 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001422 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001423 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425
1426 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1428 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 PyErr_NoMemory();
1430 return -1;
1431 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001432 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 _PyUnicode_WSTR(unicode), end,
1434 PyUnicode_1BYTE_DATA(unicode));
1435 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1436 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1437 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1438 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001439 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001441 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 }
1443 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001444 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001445 _PyUnicode_UTF8(unicode) = NULL;
1446 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 }
1448 PyObject_FREE(_PyUnicode_WSTR(unicode));
1449 _PyUnicode_WSTR(unicode) = NULL;
1450 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451 }
1452 /* In this case we might have to convert down from 4-byte native
1453 wchar_t to 2-byte unicode. */
1454 else if (maxchar < 65536) {
1455 assert(num_surrogates == 0 &&
1456 "FindMaxCharAndNumSurrogatePairs() messed up");
1457
Victor Stinner506f5922011-09-28 22:34:18 +02001458#if SIZEOF_WCHAR_T == 2
1459 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1462 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1463 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001466#else
1467 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001469 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001471 PyErr_NoMemory();
1472 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 }
Victor Stinner506f5922011-09-28 22:34:18 +02001474 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1475 _PyUnicode_WSTR(unicode), end,
1476 PyUnicode_2BYTE_DATA(unicode));
1477 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1478 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1479 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8(unicode) = NULL;
1481 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001482 PyObject_FREE(_PyUnicode_WSTR(unicode));
1483 _PyUnicode_WSTR(unicode) = NULL;
1484 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1485#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 }
1487 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1488 else {
1489#if SIZEOF_WCHAR_T == 2
1490 /* in case the native representation is 2-bytes, we need to allocate a
1491 new normalized 4-byte version. */
1492 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001493 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1494 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyErr_NoMemory();
1496 return -1;
1497 }
1498 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1499 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 _PyUnicode_UTF8(unicode) = NULL;
1501 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001502 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1503 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001504 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 PyObject_FREE(_PyUnicode_WSTR(unicode));
1506 _PyUnicode_WSTR(unicode) = NULL;
1507 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1508#else
1509 assert(num_surrogates == 0);
1510
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001513 _PyUnicode_UTF8(unicode) = NULL;
1514 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1516#endif
1517 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1518 }
1519 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001520 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 return 0;
1522}
1523
Alexander Belopolsky40018472011-02-26 01:02:56 +00001524static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001525unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526{
Walter Dörwald16807132007-05-25 13:52:07 +00001527 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 case SSTATE_NOT_INTERNED:
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_MORTAL:
1532 /* revive dead object temporarily for DelItem */
1533 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001534 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 Py_FatalError(
1536 "deletion of interned string failed");
1537 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001538
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 case SSTATE_INTERNED_IMMORTAL:
1540 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001541
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 default:
1543 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001544 }
1545
Victor Stinner03490912011-10-03 23:45:12 +02001546 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001548 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001549 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001550 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1551 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001553 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554}
1555
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or the
   shared one-character object stored in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical representation can be
       one of the cached latin1 characters. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1572
Alexander Belopolsky40018472011-02-26 01:02:56 +00001573static int
Victor Stinner488fa492011-12-12 00:01:39 +01001574unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575{
Victor Stinner488fa492011-12-12 00:01:39 +01001576 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 if (Py_REFCNT(unicode) != 1)
1578 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001579 if (_PyUnicode_HASH(unicode) != -1)
1580 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_CHECK_INTERNED(unicode))
1582 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001583 if (!PyUnicode_CheckExact(unicode))
1584 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001585#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001586 /* singleton refcount is greater than 1 */
1587 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001588#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 return 1;
1590}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592static int
1593unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1594{
1595 PyObject *unicode;
1596 Py_ssize_t old_length;
1597
1598 assert(p_unicode != NULL);
1599 unicode = *p_unicode;
1600
1601 assert(unicode != NULL);
1602 assert(PyUnicode_Check(unicode));
1603 assert(0 <= length);
1604
Victor Stinner910337b2011-10-03 03:20:16 +02001605 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 old_length = PyUnicode_WSTR_LENGTH(unicode);
1607 else
1608 old_length = PyUnicode_GET_LENGTH(unicode);
1609 if (old_length == length)
1610 return 0;
1611
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001612 if (length == 0) {
1613 Py_DECREF(*p_unicode);
1614 *p_unicode = unicode_empty;
1615 Py_INCREF(*p_unicode);
1616 return 0;
1617 }
1618
Victor Stinner488fa492011-12-12 00:01:39 +01001619 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 PyObject *copy = resize_copy(unicode, length);
1621 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 Py_DECREF(*p_unicode);
1624 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001625 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626 }
1627
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 PyObject *new_unicode = resize_compact(unicode, length);
1630 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001632 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001634 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001635 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636}
1637
Alexander Belopolsky40018472011-02-26 01:02:56 +00001638int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001640{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 PyObject *unicode;
1642 if (p_unicode == NULL) {
1643 PyErr_BadInternalCall();
1644 return -1;
1645 }
1646 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 {
1649 PyErr_BadInternalCall();
1650 return -1;
1651 }
1652 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001653}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001656unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1657 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658{
1659 PyObject *result;
1660 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001661 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001662 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1663 return 0;
1664 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1665 maxchar);
1666 if (result == NULL)
1667 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001668 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001669 Py_DECREF(*p_unicode);
1670 *p_unicode = result;
1671 return 0;
1672}
1673
1674static int
1675unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1676 Py_UCS4 ch)
1677{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001678 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001679 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001680 return -1;
1681 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1682 PyUnicode_DATA(*p_unicode),
1683 (*pos)++, ch);
1684 return 0;
1685}
1686
Victor Stinnerc5166102012-02-22 13:55:02 +01001687/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001688
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001689 WARNING: The function doesn't copy the terminating null character and
1690 doesn't check the maximum character (may write a latin1 character in an
1691 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001692static void
1693unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1694 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001695{
1696 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1697 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001698 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001699
1700 switch (kind) {
1701 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001703#ifdef Py_DEBUG
1704 if (PyUnicode_IS_ASCII(unicode)) {
1705 Py_UCS4 maxchar = ucs1lib_find_max_char(
1706 (const Py_UCS1*)str,
1707 (const Py_UCS1*)str + len);
1708 assert(maxchar < 128);
1709 }
1710#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001711 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 case PyUnicode_2BYTE_KIND: {
1715 Py_UCS2 *start = (Py_UCS2 *)data + index;
1716 Py_UCS2 *ucs2 = start;
1717 assert(index <= PyUnicode_GET_LENGTH(unicode));
1718
Victor Stinner184252a2012-06-16 02:57:41 +02001719 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 *ucs2 = (Py_UCS2)*str;
1721
1722 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001723 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 default: {
1726 Py_UCS4 *start = (Py_UCS4 *)data + index;
1727 Py_UCS4 *ucs4 = start;
1728 assert(kind == PyUnicode_4BYTE_KIND);
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs4 = (Py_UCS4)*str;
1733
1734 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 }
1736 }
1737}
1738
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740static PyObject*
1741get_latin1_char(unsigned char ch)
1742{
Victor Stinnera464fc12011-10-02 20:39:30 +02001743 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001745 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 if (!unicode)
1747 return NULL;
1748 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001749 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 unicode_latin1[ch] = unicode;
1751 }
1752 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Alexander Belopolsky40018472011-02-26 01:02:56 +00001756PyObject *
1757PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001759 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 Py_UCS4 maxchar = 0;
1761 Py_ssize_t num_surrogates;
1762
1763 if (u == NULL)
1764 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001766 /* If the Unicode data is known at construction time, we can apply
1767 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 /* Optimization for empty strings */
1770 if (size == 0 && unicode_empty != NULL) {
1771 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001772 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001773 }
Tim Petersced69f82003-09-16 20:30:58 +00001774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 /* Single character Unicode objects in the Latin-1 range are
1776 shared when using this constructor */
1777 if (size == 1 && *u < 256)
1778 return get_latin1_char((unsigned char)*u);
1779
1780 /* If not empty and not single character, copy the Unicode data
1781 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001782 if (find_maxchar_surrogates(u, u + size,
1783 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return NULL;
1785
Victor Stinner8faf8212011-12-08 22:14:11 +01001786 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 if (!unicode)
1788 return NULL;
1789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 switch (PyUnicode_KIND(unicode)) {
1791 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001792 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1794 break;
1795 case PyUnicode_2BYTE_KIND:
1796#if Py_UNICODE_SIZE == 2
1797 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1798#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001799 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1801#endif
1802 break;
1803 case PyUnicode_4BYTE_KIND:
1804#if SIZEOF_WCHAR_T == 2
1805 /* This is the only case which has to process surrogates, thus
1806 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001807 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808#else
1809 assert(num_surrogates == 0);
1810 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1811#endif
1812 break;
1813 default:
1814 assert(0 && "Impossible state");
1815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001817 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818}
1819
Alexander Belopolsky40018472011-02-26 01:02:56 +00001820PyObject *
1821PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 if (size < 0) {
1824 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001825 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 return NULL;
1827 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001828 if (u != NULL)
1829 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1830 else
1831 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832}
1833
Alexander Belopolsky40018472011-02-26 01:02:56 +00001834PyObject *
1835PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001836{
1837 size_t size = strlen(u);
1838 if (size > PY_SSIZE_T_MAX) {
1839 PyErr_SetString(PyExc_OverflowError, "input too long");
1840 return NULL;
1841 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001842 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001843}
1844
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001845PyObject *
1846_PyUnicode_FromId(_Py_Identifier *id)
1847{
1848 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001849 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1850 strlen(id->string),
1851 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001852 if (!id->object)
1853 return NULL;
1854 PyUnicode_InternInPlace(&id->object);
1855 assert(!id->next);
1856 id->next = static_strings;
1857 static_strings = id;
1858 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 return id->object;
1860}
1861
1862void
1863_PyUnicode_ClearStaticStrings()
1864{
1865 _Py_Identifier *i;
1866 for (i = static_strings; i; i = i->next) {
1867 Py_DECREF(i->object);
1868 i->object = NULL;
1869 i->next = NULL;
1870 }
1871}
1872
Benjamin Peterson0df54292012-03-26 14:50:32 -04001873/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874
Victor Stinnerd3f08822012-05-29 12:57:52 +02001875PyObject*
1876_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001877{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001878 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001879 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001880 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001881#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001882 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001883#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001884 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001885 }
Victor Stinner785938e2011-12-11 20:09:03 +01001886 unicode = PyUnicode_New(size, 127);
1887 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001888 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001889 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1890 assert(_PyUnicode_CheckConsistency(unicode, 1));
1891 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001892}
1893
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001894static Py_UCS4
1895kind_maxchar_limit(unsigned int kind)
1896{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001897 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001898 case PyUnicode_1BYTE_KIND:
1899 return 0x80;
1900 case PyUnicode_2BYTE_KIND:
1901 return 0x100;
1902 case PyUnicode_4BYTE_KIND:
1903 return 0x10000;
1904 default:
1905 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001906 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 }
1908}
1909
Victor Stinnere6abb482012-05-02 01:15:40 +02001910Py_LOCAL_INLINE(Py_UCS4)
1911align_maxchar(Py_UCS4 maxchar)
1912{
1913 if (maxchar <= 127)
1914 return 127;
1915 else if (maxchar <= 255)
1916 return 255;
1917 else if (maxchar <= 65535)
1918 return 65535;
1919 else
1920 return MAX_UNICODE;
1921}
1922
Victor Stinner702c7342011-10-05 13:50:52 +02001923static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001924_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001928
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929 if (size == 0) {
1930 Py_INCREF(unicode_empty);
1931 return unicode_empty;
1932 }
1933 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001934 if (size == 1)
1935 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (!res)
1940 return NULL;
1941 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001942 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001944}
1945
Victor Stinnere57b1c02011-09-28 22:20:48 +02001946static PyObject*
1947_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948{
1949 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001952 if (size == 0) {
1953 Py_INCREF(unicode_empty);
1954 return unicode_empty;
1955 }
1956 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001957 if (size == 1) {
1958 Py_UCS4 ch = u[0];
1959 if (ch < 256)
1960 return get_latin1_char((unsigned char)ch);
1961
1962 res = PyUnicode_New(1, ch);
1963 if (res == NULL)
1964 return NULL;
1965 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1966 assert(_PyUnicode_CheckConsistency(res, 1));
1967 return res;
1968 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001970 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 if (!res)
1973 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001974 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 else {
1977 _PyUnicode_CONVERT_BYTES(
1978 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 if (size == 0) {
1991 Py_INCREF(unicode_empty);
1992 return unicode_empty;
1993 }
1994 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001995 if (size == 1) {
1996 Py_UCS4 ch = u[0];
1997 if (ch < 256)
1998 return get_latin1_char((unsigned char)ch);
1999
2000 res = PyUnicode_New(1, ch);
2001 if (res == NULL)
2002 return NULL;
2003 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
2004 assert(_PyUnicode_CheckConsistency(res, 1));
2005 return res;
2006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002007
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002008 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002009 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 if (!res)
2011 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002012 if (max_char < 256)
2013 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2014 PyUnicode_1BYTE_DATA(res));
2015 else if (max_char < 0x10000)
2016 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2017 PyUnicode_2BYTE_DATA(res));
2018 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002020 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 return res;
2022}
2023
2024PyObject*
2025PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2026{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002027 if (size < 0) {
2028 PyErr_SetString(PyExc_ValueError, "size must be positive");
2029 return NULL;
2030 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002031 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002033 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002035 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002037 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002038 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002039 PyErr_SetString(PyExc_SystemError, "invalid kind");
2040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042}
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044Py_UCS4
2045_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2046{
2047 enum PyUnicode_Kind kind;
2048 void *startptr, *endptr;
2049
2050 assert(PyUnicode_IS_READY(unicode));
2051 assert(0 <= start);
2052 assert(end <= PyUnicode_GET_LENGTH(unicode));
2053 assert(start <= end);
2054
2055 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2056 return PyUnicode_MAX_CHAR_VALUE(unicode);
2057
2058 if (start == end)
2059 return 127;
2060
Victor Stinner94d558b2012-04-27 22:26:58 +02002061 if (PyUnicode_IS_ASCII(unicode))
2062 return 127;
2063
Victor Stinnerece58de2012-04-23 23:36:38 +02002064 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002065 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002066 endptr = (char *)startptr + end * kind;
2067 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002068 switch(kind) {
2069 case PyUnicode_1BYTE_KIND:
2070 return ucs1lib_find_max_char(startptr, endptr);
2071 case PyUnicode_2BYTE_KIND:
2072 return ucs2lib_find_max_char(startptr, endptr);
2073 case PyUnicode_4BYTE_KIND:
2074 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002076 assert(0);
2077 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 }
2079}
2080
Victor Stinner25a4b292011-10-06 12:31:55 +02002081/* Ensure that a string uses the most efficient storage, if it is not the
2082 case: create a new string with of the right kind. Write NULL into *p_unicode
2083 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002084static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002085unicode_adjust_maxchar(PyObject **p_unicode)
2086{
2087 PyObject *unicode, *copy;
2088 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002089 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 unsigned int kind;
2091
2092 assert(p_unicode != NULL);
2093 unicode = *p_unicode;
2094 assert(PyUnicode_IS_READY(unicode));
2095 if (PyUnicode_IS_ASCII(unicode))
2096 return;
2097
2098 len = PyUnicode_GET_LENGTH(unicode);
2099 kind = PyUnicode_KIND(unicode);
2100 if (kind == PyUnicode_1BYTE_KIND) {
2101 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002102 max_char = ucs1lib_find_max_char(u, u + len);
2103 if (max_char >= 128)
2104 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 }
2106 else if (kind == PyUnicode_2BYTE_KIND) {
2107 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002108 max_char = ucs2lib_find_max_char(u, u + len);
2109 if (max_char >= 256)
2110 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 }
2112 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002114 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002115 max_char = ucs4lib_find_max_char(u, u + len);
2116 if (max_char >= 0x10000)
2117 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002118 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002120 if (copy != NULL)
2121 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 Py_DECREF(unicode);
2123 *p_unicode = copy;
2124}
2125
Victor Stinner034f6cf2011-09-30 02:26:44 +02002126PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002127_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002128{
Victor Stinner87af4f22011-11-21 23:03:47 +01002129 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002130 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002131
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132 if (!PyUnicode_Check(unicode)) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002136 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002138
Victor Stinner87af4f22011-11-21 23:03:47 +01002139 length = PyUnicode_GET_LENGTH(unicode);
2140 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 if (!copy)
2142 return NULL;
2143 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2144
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2146 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002147 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149}
2150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152/* Widen Unicode objects to larger buffers. Don't write terminating null
2153 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154
2155void*
2156_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2157{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 Py_ssize_t len;
2159 void *result;
2160 unsigned int skind;
2161
Benjamin Petersonbac79492012-01-14 13:34:47 -05002162 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 return NULL;
2164
2165 len = PyUnicode_GET_LENGTH(s);
2166 skind = PyUnicode_KIND(s);
2167 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002168 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 return NULL;
2170 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002171 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 case PyUnicode_2BYTE_KIND:
2173 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2174 if (!result)
2175 return PyErr_NoMemory();
2176 assert(skind == PyUnicode_1BYTE_KIND);
2177 _PyUnicode_CONVERT_BYTES(
2178 Py_UCS1, Py_UCS2,
2179 PyUnicode_1BYTE_DATA(s),
2180 PyUnicode_1BYTE_DATA(s) + len,
2181 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_4BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 if (skind == PyUnicode_2BYTE_KIND) {
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS2, Py_UCS4,
2190 PyUnicode_2BYTE_DATA(s),
2191 PyUnicode_2BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 else {
2195 assert(skind == PyUnicode_1BYTE_KIND);
2196 _PyUnicode_CONVERT_BYTES(
2197 Py_UCS1, Py_UCS4,
2198 PyUnicode_1BYTE_DATA(s),
2199 PyUnicode_1BYTE_DATA(s) + len,
2200 result);
2201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 default:
2204 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 }
Victor Stinner01698042011-10-04 00:04:26 +02002206 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 return NULL;
2208}
2209
2210static Py_UCS4*
2211as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2212 int copy_null)
2213{
2214 int kind;
2215 void *data;
2216 Py_ssize_t len, targetlen;
2217 if (PyUnicode_READY(string) == -1)
2218 return NULL;
2219 kind = PyUnicode_KIND(string);
2220 data = PyUnicode_DATA(string);
2221 len = PyUnicode_GET_LENGTH(string);
2222 targetlen = len;
2223 if (copy_null)
2224 targetlen++;
2225 if (!target) {
2226 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2227 PyErr_NoMemory();
2228 return NULL;
2229 }
2230 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2231 if (!target) {
2232 PyErr_NoMemory();
2233 return NULL;
2234 }
2235 }
2236 else {
2237 if (targetsize < targetlen) {
2238 PyErr_Format(PyExc_SystemError,
2239 "string is longer than the buffer");
2240 if (copy_null && 0 < targetsize)
2241 target[0] = 0;
2242 return NULL;
2243 }
2244 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002245 if (kind == PyUnicode_1BYTE_KIND) {
2246 Py_UCS1 *start = (Py_UCS1 *) data;
2247 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002249 else if (kind == PyUnicode_2BYTE_KIND) {
2250 Py_UCS2 *start = (Py_UCS2 *) data;
2251 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2252 }
2253 else {
2254 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 if (copy_null)
2258 target[len] = 0;
2259 return target;
2260}
2261
2262Py_UCS4*
2263PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2264 int copy_null)
2265{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002266 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 PyErr_BadInternalCall();
2268 return NULL;
2269 }
2270 return as_ucs4(string, target, targetsize, copy_null);
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4Copy(PyObject *string)
2275{
2276 return as_ucs4(string, NULL, 0, 1);
2277}
2278
2279#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002280
Alexander Belopolsky40018472011-02-26 01:02:56 +00002281PyObject *
2282PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002285 if (size == 0) {
2286 Py_INCREF(unicode_empty);
2287 return unicode_empty;
2288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 PyErr_BadInternalCall();
2290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 }
2292
Martin v. Löwis790465f2008-04-05 20:41:37 +00002293 if (size == -1) {
2294 size = wcslen(w);
2295 }
2296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298}
2299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301
Walter Dörwald346737f2007-05-31 10:44:43 +00002302static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002303makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002304 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 if (longflag)
2308 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002309 else if (longlongflag) {
2310 /* longlongflag should only ever be nonzero on machines with
2311 HAVE_LONG_LONG defined */
2312#ifdef HAVE_LONG_LONG
2313 char *f = PY_FORMAT_LONG_LONG;
2314 while (*f)
2315 *fmt++ = *f++;
2316#else
2317 /* we shouldn't ever get here */
2318 assert(0);
2319 *fmt++ = 'l';
2320#endif
2321 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002322 else if (size_tflag) {
2323 char *f = PY_FORMAT_SIZE_T;
2324 while (*f)
2325 *fmt++ = *f++;
2326 }
2327 *fmt++ = c;
2328 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002329}
2330
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).

   This is used as the size of the char arrays that sprintf() formats
   integer and pointer arguments into.
   NOTE(review): the constant 2 is one more than the sign mentioned above;
   presumably the extra slot is for the terminating null byte written by
   sprintf() -- confirm before changing the formula. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002335
2336static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002337unicode_fromformat_arg(_PyUnicodeWriter *writer,
2338 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002339{
Victor Stinnere215d962012-10-06 23:03:36 +02002340 const char *p;
2341 Py_ssize_t len;
2342 int zeropad;
2343 int width;
2344 int precision;
2345 int longflag;
2346 int longlongflag;
2347 int size_tflag;
2348 int fill;
2349
2350 p = f;
2351 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002352 zeropad = 0;
2353 if (*f == '0') {
2354 zeropad = 1;
2355 f++;
2356 }
Victor Stinner96865452011-03-01 23:44:09 +00002357
2358 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002359 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002360 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002361 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2362 PyErr_SetString(PyExc_ValueError,
2363 "width too big");
2364 return NULL;
2365 }
Victor Stinnere215d962012-10-06 23:03:36 +02002366 width = (width*10) + (*f - '0');
2367 f++;
2368 }
Victor Stinner96865452011-03-01 23:44:09 +00002369 precision = 0;
2370 if (*f == '.') {
2371 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002372 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002373 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2374 PyErr_SetString(PyExc_ValueError,
2375 "precision too big");
2376 return NULL;
2377 }
Victor Stinnere215d962012-10-06 23:03:36 +02002378 precision = (precision*10) + (*f - '0');
2379 f++;
2380 }
Victor Stinner96865452011-03-01 23:44:09 +00002381 if (*f == '%') {
2382 /* "%.3%s" => f points to "3" */
2383 f--;
2384 }
2385 }
2386 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002387 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002388 f--;
2389 }
Victor Stinner96865452011-03-01 23:44:09 +00002390
2391 /* Handle %ld, %lu, %lld and %llu. */
2392 longflag = 0;
2393 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002394 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002395 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002396 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002397 longflag = 1;
2398 ++f;
2399 }
2400#ifdef HAVE_LONG_LONG
2401 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002402 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002403 longlongflag = 1;
2404 f += 2;
2405 }
2406#endif
2407 }
2408 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002409 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002410 size_tflag = 1;
2411 ++f;
2412 }
Victor Stinnere215d962012-10-06 23:03:36 +02002413
2414 if (f[1] == '\0')
2415 writer->overallocate = 0;
2416
2417 switch (*f) {
2418 case 'c':
2419 {
2420 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002421 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2422 PyErr_SetString(PyExc_ValueError,
2423 "character argument not in range(0x110000)");
2424 return NULL;
2425 }
Victor Stinnere215d962012-10-06 23:03:36 +02002426 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2427 return NULL;
2428 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2429 writer->pos++;
2430 break;
2431 }
2432
2433 case 'i':
2434 case 'd':
2435 case 'u':
2436 case 'x':
2437 {
2438 /* used by sprintf */
2439 char fmt[10]; /* should be enough for "%0lld\0" */
Victor Stinner15a11362012-10-06 23:48:20 +02002440 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinnere215d962012-10-06 23:03:36 +02002441
2442 if (*f == 'u') {
2443 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2444
2445 if (longflag)
2446 len = sprintf(buffer, fmt,
2447 va_arg(*vargs, unsigned long));
2448#ifdef HAVE_LONG_LONG
2449 else if (longlongflag)
2450 len = sprintf(buffer, fmt,
2451 va_arg(*vargs, unsigned PY_LONG_LONG));
2452#endif
2453 else if (size_tflag)
2454 len = sprintf(buffer, fmt,
2455 va_arg(*vargs, size_t));
2456 else
2457 len = sprintf(buffer, fmt,
2458 va_arg(*vargs, unsigned int));
2459 }
2460 else if (*f == 'x') {
2461 makefmt(fmt, 0, 0, 0, 'x');
2462 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2463 }
2464 else {
2465 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2466
2467 if (longflag)
2468 len = sprintf(buffer, fmt,
2469 va_arg(*vargs, long));
2470#ifdef HAVE_LONG_LONG
2471 else if (longlongflag)
2472 len = sprintf(buffer, fmt,
2473 va_arg(*vargs, PY_LONG_LONG));
2474#endif
2475 else if (size_tflag)
2476 len = sprintf(buffer, fmt,
2477 va_arg(*vargs, Py_ssize_t));
2478 else
2479 len = sprintf(buffer, fmt,
2480 va_arg(*vargs, int));
2481 }
2482 assert(len >= 0);
2483
Victor Stinnere215d962012-10-06 23:03:36 +02002484 if (precision < len)
2485 precision = len;
2486 if (width > precision) {
2487 Py_UCS4 fillchar;
2488 fill = width - precision;
2489 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002490 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) == -1)
2491 return NULL;
2492 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2493 return NULL;
2494 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002495 }
Victor Stinner15a11362012-10-06 23:48:20 +02002496 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002497 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002498 if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1)
2499 return NULL;
2500 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2501 return NULL;
2502 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002503 }
Victor Stinner15a11362012-10-06 23:48:20 +02002504 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002505 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002506 break;
2507 }
2508
2509 case 'p':
2510 {
2511 char number[MAX_LONG_LONG_CHARS];
2512
2513 len = sprintf(number, "%p", va_arg(*vargs, void*));
2514 assert(len >= 0);
2515
2516 /* %p is ill-defined: ensure leading 0x. */
2517 if (number[1] == 'X')
2518 number[1] = 'x';
2519 else if (number[1] != 'x') {
2520 memmove(number + 2, number,
2521 strlen(number) + 1);
2522 number[0] = '0';
2523 number[1] = 'x';
2524 len += 2;
2525 }
2526
2527 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2528 return NULL;
2529 break;
2530 }
2531
2532 case 's':
2533 {
2534 /* UTF-8 */
2535 const char *s = va_arg(*vargs, const char*);
2536 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2537 if (!str)
2538 return NULL;
2539 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2540 Py_DECREF(str);
2541 return NULL;
2542 }
2543 Py_DECREF(str);
2544 break;
2545 }
2546
2547 case 'U':
2548 {
2549 PyObject *obj = va_arg(*vargs, PyObject *);
2550 assert(obj && _PyUnicode_CHECK(obj));
2551
2552 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2553 return NULL;
2554 break;
2555 }
2556
2557 case 'V':
2558 {
2559 PyObject *obj = va_arg(*vargs, PyObject *);
2560 const char *str = va_arg(*vargs, const char *);
2561 PyObject *str_obj;
2562 assert(obj || str);
2563 if (obj) {
2564 assert(_PyUnicode_CHECK(obj));
2565 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2566 return NULL;
2567 }
2568 else {
2569 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2570 if (!str_obj)
2571 return NULL;
2572 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2573 Py_DECREF(str_obj);
2574 return NULL;
2575 }
2576 Py_DECREF(str_obj);
2577 }
2578 break;
2579 }
2580
2581 case 'S':
2582 {
2583 PyObject *obj = va_arg(*vargs, PyObject *);
2584 PyObject *str;
2585 assert(obj);
2586 str = PyObject_Str(obj);
2587 if (!str)
2588 return NULL;
2589 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2590 Py_DECREF(str);
2591 return NULL;
2592 }
2593 Py_DECREF(str);
2594 break;
2595 }
2596
2597 case 'R':
2598 {
2599 PyObject *obj = va_arg(*vargs, PyObject *);
2600 PyObject *repr;
2601 assert(obj);
2602 repr = PyObject_Repr(obj);
2603 if (!repr)
2604 return NULL;
2605 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2606 Py_DECREF(repr);
2607 return NULL;
2608 }
2609 Py_DECREF(repr);
2610 break;
2611 }
2612
2613 case 'A':
2614 {
2615 PyObject *obj = va_arg(*vargs, PyObject *);
2616 PyObject *ascii;
2617 assert(obj);
2618 ascii = PyObject_ASCII(obj);
2619 if (!ascii)
2620 return NULL;
2621 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2622 Py_DECREF(ascii);
2623 return NULL;
2624 }
2625 Py_DECREF(ascii);
2626 break;
2627 }
2628
2629 case '%':
2630 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2631 return NULL;
2632 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2633 writer->pos++;
2634 break;
2635
2636 default:
2637 /* if we stumble upon an unknown formatting code, copy the rest
2638 of the format string to the output string. (we cannot just
2639 skip the code, since there's no way to know what's in the
2640 argument list) */
2641 len = strlen(p);
2642 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2643 return NULL;
2644 f = p+len;
2645 return f;
2646 }
2647
2648 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002649 return f;
2650}
2651
Walter Dörwaldd2034312007-05-18 16:29:38 +00002652PyObject *
2653PyUnicode_FromFormatV(const char *format, va_list vargs)
2654{
Victor Stinnere215d962012-10-06 23:03:36 +02002655 va_list vargs2;
2656 const char *f;
2657 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002658
Victor Stinnere215d962012-10-06 23:03:36 +02002659 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2660
2661 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2662 Copy it to be able to pass a reference to a subfunction. */
2663 Py_VA_COPY(vargs2, vargs);
2664
2665 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002667 f = unicode_fromformat_arg(&writer, f, &vargs2);
2668 if (f == NULL)
2669 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002672 const char *p;
2673 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002674
Victor Stinnere215d962012-10-06 23:03:36 +02002675 p = f;
2676 do
2677 {
2678 if ((unsigned char)*p > 127) {
2679 PyErr_Format(PyExc_ValueError,
2680 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2681 "string, got a non-ASCII byte: 0x%02x",
2682 (unsigned char)*p);
2683 return NULL;
2684 }
2685 p++;
2686 }
2687 while (*p != '\0' && *p != '%');
2688 len = p - f;
2689
2690 if (*p == '\0')
2691 writer.overallocate = 0;
2692 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2693 goto fail;
2694 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2695 writer.pos += len;
2696
2697 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 }
Victor Stinnere215d962012-10-06 23:03:36 +02002700 return _PyUnicodeWriter_Finish(&writer);
2701
2702 fail:
2703 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705}
2706
Walter Dörwaldd2034312007-05-18 16:29:38 +00002707PyObject *
2708PyUnicode_FromFormat(const char *format, ...)
2709{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 PyObject* ret;
2711 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002712
2713#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002717#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002718 ret = PyUnicode_FromFormatV(format, vargs);
2719 va_end(vargs);
2720 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002721}
2722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002723#ifdef HAVE_WCHAR_H
2724
Victor Stinner5593d8a2010-10-02 11:11:27 +00002725/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2726 convert a Unicode object to a wide character string.
2727
Victor Stinnerd88d9832011-09-06 02:00:05 +02002728 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002729 character) required to convert the unicode object. Ignore size argument.
2730
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002733 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002734static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002735unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002736 wchar_t *w,
2737 Py_ssize_t size)
2738{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002739 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 const wchar_t *wstr;
2741
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002742 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (wstr == NULL)
2744 return -1;
2745
Victor Stinner5593d8a2010-10-02 11:11:27 +00002746 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002747 if (size > res)
2748 size = res + 1;
2749 else
2750 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002752 return res;
2753 }
2754 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002756}
2757
2758Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002759PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002760 wchar_t *w,
2761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002762{
2763 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002764 PyErr_BadInternalCall();
2765 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002767 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002768}
2769
Victor Stinner137c34c2010-09-29 10:25:54 +00002770wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002771PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002772 Py_ssize_t *size)
2773{
2774 wchar_t* buffer;
2775 Py_ssize_t buflen;
2776
2777 if (unicode == NULL) {
2778 PyErr_BadInternalCall();
2779 return NULL;
2780 }
2781
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002782 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 if (buflen == -1)
2784 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002786 PyErr_NoMemory();
2787 return NULL;
2788 }
2789
Victor Stinner137c34c2010-09-29 10:25:54 +00002790 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2791 if (buffer == NULL) {
2792 PyErr_NoMemory();
2793 return NULL;
2794 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002795 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002796 if (buflen == -1) {
2797 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002798 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002799 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002800 if (size != NULL)
2801 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002802 return buffer;
2803}
2804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806
Alexander Belopolsky40018472011-02-26 01:02:56 +00002807PyObject *
2808PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002809{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002811 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 PyErr_SetString(PyExc_ValueError,
2813 "chr() arg not in range(0x110000)");
2814 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002815 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 if (ordinal < 256)
2818 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002820 v = PyUnicode_New(1, ordinal);
2821 if (v == NULL)
2822 return NULL;
2823 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002824 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002825 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002826}
2827
Alexander Belopolsky40018472011-02-26 01:02:56 +00002828PyObject *
2829PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002831 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002833 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002834 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002835 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 Py_INCREF(obj);
2837 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002838 }
2839 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002840 /* For a Unicode subtype that's not a Unicode object,
2841 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002842 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002843 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002844 PyErr_Format(PyExc_TypeError,
2845 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002846 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002847 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002848}
2849
Alexander Belopolsky40018472011-02-26 01:02:56 +00002850PyObject *
2851PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002852 const char *encoding,
2853 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002854{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002855 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002856 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002857
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 PyErr_BadInternalCall();
2860 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002862
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002863 /* Decoding bytes objects is the most common case and should be fast */
2864 if (PyBytes_Check(obj)) {
2865 if (PyBytes_GET_SIZE(obj) == 0) {
2866 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002867 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002868 }
2869 else {
2870 v = PyUnicode_Decode(
2871 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2872 encoding, errors);
2873 }
2874 return v;
2875 }
2876
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002877 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 PyErr_SetString(PyExc_TypeError,
2879 "decoding str is not supported");
2880 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002881 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002882
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002883 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2884 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2885 PyErr_Format(PyExc_TypeError,
2886 "coercing to str: need bytes, bytearray "
2887 "or buffer-like object, %.80s found",
2888 Py_TYPE(obj)->tp_name);
2889 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002890 }
Tim Petersced69f82003-09-16 20:30:58 +00002891
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002892 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002894 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 }
Tim Petersced69f82003-09-16 20:30:58 +00002896 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002897 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002898
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002899 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002900 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901}
2902
/* Normalize an encoding name so it can be compared against the shortcut
   names: copy `encoding` into `lower`, converting upper-case letters to
   lower case and replacing '_' with '-' (so that e.g. "UTF_8" becomes
   "utf-8").  A NULL `encoding` is treated as "utf-8".

   `lower` must point to a buffer of `lower_len` bytes.

   Return 1 on success (`lower` is NUL-terminated), 0 if the normalized name
   plus its terminating NUL does not fit into `lower_len` bytes. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Room for the terminating NUL is always required; this also keeps
       `lower_len - 1` below from wrapping around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Honour the buffer size for the default name as well. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last writable slot, reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2939
Alexander Belopolsky40018472011-02-26 01:02:56 +00002940PyObject *
2941PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002942 Py_ssize_t size,
2943 const char *encoding,
2944 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002945{
2946 PyObject *buffer = NULL, *unicode;
2947 Py_buffer info;
2948 char lower[11]; /* Enough for any encoding shortcut */
2949
Fred Drakee4315f52000-05-09 19:53:39 +00002950 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002951 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002952 if ((strcmp(lower, "utf-8") == 0) ||
2953 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002954 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002955 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002956 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002957 (strcmp(lower, "iso-8859-1") == 0))
2958 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002959#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002960 else if (strcmp(lower, "mbcs") == 0)
2961 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002962#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002963 else if (strcmp(lower, "ascii") == 0)
2964 return PyUnicode_DecodeASCII(s, size, errors);
2965 else if (strcmp(lower, "utf-16") == 0)
2966 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2967 else if (strcmp(lower, "utf-32") == 0)
2968 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970
2971 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002972 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002973 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002974 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002975 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 if (buffer == NULL)
2977 goto onError;
2978 unicode = PyCodec_Decode(buffer, encoding, errors);
2979 if (unicode == NULL)
2980 goto onError;
2981 if (!PyUnicode_Check(unicode)) {
2982 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002983 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002984 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 Py_DECREF(unicode);
2986 goto onError;
2987 }
2988 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002989 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002990
Benjamin Peterson29060642009-01-31 22:14:21 +00002991 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992 Py_XDECREF(buffer);
2993 return NULL;
2994}
2995
Alexander Belopolsky40018472011-02-26 01:02:56 +00002996PyObject *
2997PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002998 const char *encoding,
2999 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003000{
3001 PyObject *v;
3002
3003 if (!PyUnicode_Check(unicode)) {
3004 PyErr_BadArgument();
3005 goto onError;
3006 }
3007
3008 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003009 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003010
3011 /* Decode via the codec registry */
3012 v = PyCodec_Decode(unicode, encoding, errors);
3013 if (v == NULL)
3014 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003015 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003018 return NULL;
3019}
3020
Alexander Belopolsky40018472011-02-26 01:02:56 +00003021PyObject *
3022PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003023 const char *encoding,
3024 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003025{
3026 PyObject *v;
3027
3028 if (!PyUnicode_Check(unicode)) {
3029 PyErr_BadArgument();
3030 goto onError;
3031 }
3032
3033 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003035
3036 /* Decode via the codec registry */
3037 v = PyCodec_Decode(unicode, encoding, errors);
3038 if (v == NULL)
3039 goto onError;
3040 if (!PyUnicode_Check(v)) {
3041 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003042 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003043 Py_TYPE(v)->tp_name);
3044 Py_DECREF(v);
3045 goto onError;
3046 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003047 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003048
Benjamin Peterson29060642009-01-31 22:14:21 +00003049 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003050 return NULL;
3051}
3052
Alexander Belopolsky40018472011-02-26 01:02:56 +00003053PyObject *
3054PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003055 Py_ssize_t size,
3056 const char *encoding,
3057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003058{
3059 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003060
Guido van Rossumd57fd912000-03-10 22:53:23 +00003061 unicode = PyUnicode_FromUnicode(s, size);
3062 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003063 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003064 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3065 Py_DECREF(unicode);
3066 return v;
3067}
3068
Alexander Belopolsky40018472011-02-26 01:02:56 +00003069PyObject *
3070PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003071 const char *encoding,
3072 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003073{
3074 PyObject *v;
3075
3076 if (!PyUnicode_Check(unicode)) {
3077 PyErr_BadArgument();
3078 goto onError;
3079 }
3080
3081 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003082 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003083
3084 /* Encode via the codec registry */
3085 v = PyCodec_Encode(unicode, encoding, errors);
3086 if (v == NULL)
3087 goto onError;
3088 return v;
3089
Benjamin Peterson29060642009-01-31 22:14:21 +00003090 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003091 return NULL;
3092}
3093
/* Locate the first wide character of `wstr` that wcstombs() cannot
   convert, by converting the string one character at a time (one
   surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index, in wchar_t units, of the offending character, or 0 if
   every character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the terminating NUL. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return (size_t)(unit_start - wstr);
    }

    /* failed to find the unencodable character */
    return 0;
}
3140
Victor Stinner1b579672011-12-17 05:47:23 +01003141static int
3142locale_error_handler(const char *errors, int *surrogateescape)
3143{
3144 if (errors == NULL) {
3145 *surrogateescape = 0;
3146 return 0;
3147 }
3148
3149 if (strcmp(errors, "strict") == 0) {
3150 *surrogateescape = 0;
3151 return 0;
3152 }
3153 if (strcmp(errors, "surrogateescape") == 0) {
3154 *surrogateescape = 1;
3155 return 0;
3156 }
3157 PyErr_Format(PyExc_ValueError,
3158 "only 'strict' and 'surrogateescape' error handlers "
3159 "are supported, not '%s'",
3160 errors);
3161 return -1;
3162}
3163
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003164PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003165PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003166{
3167 Py_ssize_t wlen, wlen2;
3168 wchar_t *wstr;
3169 PyObject *bytes = NULL;
3170 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003171 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003172 PyObject *exc;
3173 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003174 int surrogateescape;
3175
3176 if (locale_error_handler(errors, &surrogateescape) < 0)
3177 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003178
3179 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3180 if (wstr == NULL)
3181 return NULL;
3182
3183 wlen2 = wcslen(wstr);
3184 if (wlen2 != wlen) {
3185 PyMem_Free(wstr);
3186 PyErr_SetString(PyExc_TypeError, "embedded null character");
3187 return NULL;
3188 }
3189
3190 if (surrogateescape) {
3191 /* locale encoding with surrogateescape */
3192 char *str;
3193
3194 str = _Py_wchar2char(wstr, &error_pos);
3195 if (str == NULL) {
3196 if (error_pos == (size_t)-1) {
3197 PyErr_NoMemory();
3198 PyMem_Free(wstr);
3199 return NULL;
3200 }
3201 else {
3202 goto encode_error;
3203 }
3204 }
3205 PyMem_Free(wstr);
3206
3207 bytes = PyBytes_FromString(str);
3208 PyMem_Free(str);
3209 }
3210 else {
3211 size_t len, len2;
3212
3213 len = wcstombs(NULL, wstr, 0);
3214 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003215 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 goto encode_error;
3217 }
3218
3219 bytes = PyBytes_FromStringAndSize(NULL, len);
3220 if (bytes == NULL) {
3221 PyMem_Free(wstr);
3222 return NULL;
3223 }
3224
3225 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3226 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003227 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003228 goto encode_error;
3229 }
3230 PyMem_Free(wstr);
3231 }
3232 return bytes;
3233
3234encode_error:
3235 errmsg = strerror(errno);
3236 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003237
3238 if (error_pos == (size_t)-1)
3239 error_pos = wcstombs_errorpos(wstr);
3240
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003241 PyMem_Free(wstr);
3242 Py_XDECREF(bytes);
3243
Victor Stinner2f197072011-12-17 07:08:30 +01003244 if (errmsg != NULL) {
3245 size_t errlen;
3246 wstr = _Py_char2wchar(errmsg, &errlen);
3247 if (wstr != NULL) {
3248 reason = PyUnicode_FromWideChar(wstr, errlen);
3249 PyMem_Free(wstr);
3250 } else
3251 errmsg = NULL;
3252 }
3253 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003254 reason = PyUnicode_FromString(
3255 "wcstombs() encountered an unencodable "
3256 "wide character");
3257 if (reason == NULL)
3258 return NULL;
3259
3260 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3261 "locale", unicode,
3262 (Py_ssize_t)error_pos,
3263 (Py_ssize_t)(error_pos+1),
3264 reason);
3265 Py_DECREF(reason);
3266 if (exc != NULL) {
3267 PyCodec_StrictErrors(exc);
3268 Py_XDECREF(exc);
3269 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003270 return NULL;
3271}
3272
Victor Stinnerad158722010-10-27 00:25:46 +00003273PyObject *
3274PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003275{
Victor Stinner99b95382011-07-04 14:23:54 +02003276#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003277 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003278#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003279 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003280#else
Victor Stinner793b5312011-04-27 00:24:21 +02003281 PyInterpreterState *interp = PyThreadState_GET()->interp;
3282 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3283 cannot use it to encode and decode filenames before it is loaded. Load
3284 the Python codec requires to encode at least its own filename. Use the C
3285 version of the locale codec until the codec registry is initialized and
3286 the Python codec is loaded.
3287
3288 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3289 cannot only rely on it: check also interp->fscodec_initialized for
3290 subinterpreters. */
3291 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003292 return PyUnicode_AsEncodedString(unicode,
3293 Py_FileSystemDefaultEncoding,
3294 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003295 }
3296 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003297 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003298 }
Victor Stinnerad158722010-10-27 00:25:46 +00003299#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003300}
3301
Alexander Belopolsky40018472011-02-26 01:02:56 +00003302PyObject *
3303PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003304 const char *encoding,
3305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003308 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003309
Guido van Rossumd57fd912000-03-10 22:53:23 +00003310 if (!PyUnicode_Check(unicode)) {
3311 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 }
Fred Drakee4315f52000-05-09 19:53:39 +00003314
Fred Drakee4315f52000-05-09 19:53:39 +00003315 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003316 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003317 if ((strcmp(lower, "utf-8") == 0) ||
3318 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003319 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003320 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003321 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003322 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003323 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003324 }
Victor Stinner37296e82010-06-10 13:36:23 +00003325 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003326 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003327 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003329#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003330 else if (strcmp(lower, "mbcs") == 0)
3331 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003332#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003333 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003334 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336
3337 /* Encode via the codec registry */
3338 v = PyCodec_Encode(unicode, encoding, errors);
3339 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003340 return NULL;
3341
3342 /* The normal path */
3343 if (PyBytes_Check(v))
3344 return v;
3345
3346 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003347 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003348 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003349 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003350
3351 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3352 "encoder %s returned bytearray instead of bytes",
3353 encoding);
3354 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003355 Py_DECREF(v);
3356 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003357 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003358
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003359 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3360 Py_DECREF(v);
3361 return b;
3362 }
3363
3364 PyErr_Format(PyExc_TypeError,
3365 "encoder did not return a bytes object (type=%.400s)",
3366 Py_TYPE(v)->tp_name);
3367 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003368 return NULL;
3369}
3370
Alexander Belopolsky40018472011-02-26 01:02:56 +00003371PyObject *
3372PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003373 const char *encoding,
3374 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003375{
3376 PyObject *v;
3377
3378 if (!PyUnicode_Check(unicode)) {
3379 PyErr_BadArgument();
3380 goto onError;
3381 }
3382
3383 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003384 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003385
3386 /* Encode via the codec registry */
3387 v = PyCodec_Encode(unicode, encoding, errors);
3388 if (v == NULL)
3389 goto onError;
3390 if (!PyUnicode_Check(v)) {
3391 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003392 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003393 Py_TYPE(v)->tp_name);
3394 Py_DECREF(v);
3395 goto onError;
3396 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003398
Benjamin Peterson29060642009-01-31 22:14:21 +00003399 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 return NULL;
3401}
3402
/* Locate the first byte sequence in the `len` bytes at `str` that cannot
   be decoded, by converting one multibyte character at a time with
   mbrtowc().

   Return the byte offset of the invalid or incomplete sequence, or 0 if
   none is found.  Without HAVE_MBRTOWC the position cannot be determined
   and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from a zeroed conversion state. */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return (size_t)(str - start);
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3433
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003434PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003435PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003436 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003437{
3438 wchar_t smallbuf[256];
3439 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3440 wchar_t *wstr;
3441 size_t wlen, wlen2;
3442 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003443 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003444 size_t error_pos;
3445 char *errmsg;
3446 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003447
3448 if (locale_error_handler(errors, &surrogateescape) < 0)
3449 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003450
3451 if (str[len] != '\0' || len != strlen(str)) {
3452 PyErr_SetString(PyExc_TypeError, "embedded null character");
3453 return NULL;
3454 }
3455
3456 if (surrogateescape)
3457 {
3458 wstr = _Py_char2wchar(str, &wlen);
3459 if (wstr == NULL) {
3460 if (wlen == (size_t)-1)
3461 PyErr_NoMemory();
3462 else
3463 PyErr_SetFromErrno(PyExc_OSError);
3464 return NULL;
3465 }
3466
3467 unicode = PyUnicode_FromWideChar(wstr, wlen);
3468 PyMem_Free(wstr);
3469 }
3470 else {
3471#ifndef HAVE_BROKEN_MBSTOWCS
3472 wlen = mbstowcs(NULL, str, 0);
3473#else
3474 wlen = len;
3475#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003476 if (wlen == (size_t)-1)
3477 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003478 if (wlen+1 <= smallbuf_len) {
3479 wstr = smallbuf;
3480 }
3481 else {
3482 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3483 return PyErr_NoMemory();
3484
3485 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3486 if (!wstr)
3487 return PyErr_NoMemory();
3488 }
3489
3490 /* This shouldn't fail now */
3491 wlen2 = mbstowcs(wstr, str, wlen+1);
3492 if (wlen2 == (size_t)-1) {
3493 if (wstr != smallbuf)
3494 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003495 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003496 }
3497#ifdef HAVE_BROKEN_MBSTOWCS
3498 assert(wlen2 == wlen);
3499#endif
3500 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3501 if (wstr != smallbuf)
3502 PyMem_Free(wstr);
3503 }
3504 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003505
3506decode_error:
3507 errmsg = strerror(errno);
3508 assert(errmsg != NULL);
3509
3510 error_pos = mbstowcs_errorpos(str, len);
3511 if (errmsg != NULL) {
3512 size_t errlen;
3513 wstr = _Py_char2wchar(errmsg, &errlen);
3514 if (wstr != NULL) {
3515 reason = PyUnicode_FromWideChar(wstr, errlen);
3516 PyMem_Free(wstr);
3517 } else
3518 errmsg = NULL;
3519 }
3520 if (errmsg == NULL)
3521 reason = PyUnicode_FromString(
3522 "mbstowcs() encountered an invalid multibyte sequence");
3523 if (reason == NULL)
3524 return NULL;
3525
3526 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3527 "locale", str, len,
3528 (Py_ssize_t)error_pos,
3529 (Py_ssize_t)(error_pos+1),
3530 reason);
3531 Py_DECREF(reason);
3532 if (exc != NULL) {
3533 PyCodec_StrictErrors(exc);
3534 Py_XDECREF(exc);
3535 }
3536 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003537}
3538
3539PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003540PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003541{
3542 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003543 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003544}
3545
3546
3547PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003548PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003549 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003550 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3551}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003552
Christian Heimes5894ba72007-11-04 11:43:14 +00003553PyObject*
3554PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3555{
Victor Stinner99b95382011-07-04 14:23:54 +02003556#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003557 return PyUnicode_DecodeMBCS(s, size, NULL);
3558#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003559 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003560#else
Victor Stinner793b5312011-04-27 00:24:21 +02003561 PyInterpreterState *interp = PyThreadState_GET()->interp;
3562 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3563 cannot use it to encode and decode filenames before it is loaded. Load
3564 the Python codec requires to encode at least its own filename. Use the C
3565 version of the locale codec until the codec registry is initialized and
3566 the Python codec is loaded.
3567
3568 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3569 cannot only rely on it: check also interp->fscodec_initialized for
3570 subinterpreters. */
3571 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572 return PyUnicode_Decode(s, size,
3573 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003574 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 }
3576 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003577 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578 }
Victor Stinnerad158722010-10-27 00:25:46 +00003579#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003580}
3581
Martin v. Löwis011e8422009-05-05 04:43:17 +00003582
3583int
Antoine Pitrou13348842012-01-29 18:36:34 +01003584_PyUnicode_HasNULChars(PyObject* s)
3585{
3586 static PyObject *nul = NULL;
3587
3588 if (nul == NULL)
3589 nul = PyUnicode_FromStringAndSize("\0", 1);
3590 if (nul == NULL)
3591 return -1;
3592 return PyUnicode_Contains(s, nul);
3593}
3594
3595
3596int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003597PyUnicode_FSConverter(PyObject* arg, void* addr)
3598{
3599 PyObject *output = NULL;
3600 Py_ssize_t size;
3601 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003602 if (arg == NULL) {
3603 Py_DECREF(*(PyObject**)addr);
3604 return 1;
3605 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003606 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003607 output = arg;
3608 Py_INCREF(output);
3609 }
3610 else {
3611 arg = PyUnicode_FromObject(arg);
3612 if (!arg)
3613 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003614 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003615 Py_DECREF(arg);
3616 if (!output)
3617 return 0;
3618 if (!PyBytes_Check(output)) {
3619 Py_DECREF(output);
3620 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3621 return 0;
3622 }
3623 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003624 size = PyBytes_GET_SIZE(output);
3625 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003626 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003627 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003628 Py_DECREF(output);
3629 return 0;
3630 }
3631 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003632 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003633}
3634
3635
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003636int
3637PyUnicode_FSDecoder(PyObject* arg, void* addr)
3638{
3639 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003640 if (arg == NULL) {
3641 Py_DECREF(*(PyObject**)addr);
3642 return 1;
3643 }
3644 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003645 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003646 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003647 output = arg;
3648 Py_INCREF(output);
3649 }
3650 else {
3651 arg = PyBytes_FromObject(arg);
3652 if (!arg)
3653 return 0;
3654 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3655 PyBytes_GET_SIZE(arg));
3656 Py_DECREF(arg);
3657 if (!output)
3658 return 0;
3659 if (!PyUnicode_Check(output)) {
3660 Py_DECREF(output);
3661 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3662 return 0;
3663 }
3664 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003665 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003666 Py_DECREF(output);
3667 return 0;
3668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003669 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003670 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003671 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3672 Py_DECREF(output);
3673 return 0;
3674 }
3675 *(PyObject**)addr = output;
3676 return Py_CLEANUP_SUPPORTED;
3677}
3678
3679
Martin v. Löwis5b222132007-06-10 09:51:05 +00003680char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003681PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003682{
Christian Heimesf3863112007-11-22 07:46:41 +00003683 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003684
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003685 if (!PyUnicode_Check(unicode)) {
3686 PyErr_BadArgument();
3687 return NULL;
3688 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003689 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003690 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003691
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003692 if (PyUnicode_UTF8(unicode) == NULL) {
3693 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003694 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3695 if (bytes == NULL)
3696 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003697 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3698 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003699 Py_DECREF(bytes);
3700 return NULL;
3701 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3703 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3704 PyBytes_AS_STRING(bytes),
3705 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 Py_DECREF(bytes);
3707 }
3708
3709 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003710 *psize = PyUnicode_UTF8_LENGTH(unicode);
3711 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003712}
3713
3714char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003715PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003716{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3718}
3719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720Py_UNICODE *
3721PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003723 const unsigned char *one_byte;
3724#if SIZEOF_WCHAR_T == 4
3725 const Py_UCS2 *two_bytes;
3726#else
3727 const Py_UCS4 *four_bytes;
3728 const Py_UCS4 *ucs4_end;
3729 Py_ssize_t num_surrogates;
3730#endif
3731 wchar_t *w;
3732 wchar_t *wchar_end;
3733
3734 if (!PyUnicode_Check(unicode)) {
3735 PyErr_BadArgument();
3736 return NULL;
3737 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003738 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003739 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003740 assert(_PyUnicode_KIND(unicode) != 0);
3741 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003742
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003743 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003745 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3746 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003747 num_surrogates = 0;
3748
3749 for (; four_bytes < ucs4_end; ++four_bytes) {
3750 if (*four_bytes > 0xFFFF)
3751 ++num_surrogates;
3752 }
3753
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3755 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3756 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757 PyErr_NoMemory();
3758 return NULL;
3759 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003760 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003762 w = _PyUnicode_WSTR(unicode);
3763 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3764 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3766 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003767 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003768 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003769 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3770 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 }
3772 else
3773 *w = *four_bytes;
3774
3775 if (w > wchar_end) {
3776 assert(0 && "Miscalculated string end");
3777 }
3778 }
3779 *w = 0;
3780#else
3781 /* sizeof(wchar_t) == 4 */
3782 Py_FatalError("Impossible unicode object state, wstr and str "
3783 "should share memory already.");
3784 return NULL;
3785#endif
3786 }
3787 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003788 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3789 (_PyUnicode_LENGTH(unicode) + 1));
3790 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 PyErr_NoMemory();
3792 return NULL;
3793 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003794 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3795 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3796 w = _PyUnicode_WSTR(unicode);
3797 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003798
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003799 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3800 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 for (; w < wchar_end; ++one_byte, ++w)
3802 *w = *one_byte;
3803 /* null-terminate the wstr */
3804 *w = 0;
3805 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003806 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003807#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003808 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 for (; w < wchar_end; ++two_bytes, ++w)
3810 *w = *two_bytes;
3811 /* null-terminate the wstr */
3812 *w = 0;
3813#else
3814 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003815 PyObject_FREE(_PyUnicode_WSTR(unicode));
3816 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 Py_FatalError("Impossible unicode object state, wstr "
3818 "and str should share memory already.");
3819 return NULL;
3820#endif
3821 }
3822 else {
3823 assert(0 && "This should never happen.");
3824 }
3825 }
3826 }
3827 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003828 *size = PyUnicode_WSTR_LENGTH(unicode);
3829 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003830}
3831
Alexander Belopolsky40018472011-02-26 01:02:56 +00003832Py_UNICODE *
3833PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836}
3837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003838
Alexander Belopolsky40018472011-02-26 01:02:56 +00003839Py_ssize_t
3840PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003841{
3842 if (!PyUnicode_Check(unicode)) {
3843 PyErr_BadArgument();
3844 goto onError;
3845 }
3846 return PyUnicode_GET_SIZE(unicode);
3847
Benjamin Peterson29060642009-01-31 22:14:21 +00003848 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 return -1;
3850}
3851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003852Py_ssize_t
3853PyUnicode_GetLength(PyObject *unicode)
3854{
Victor Stinner07621332012-06-16 04:53:46 +02003855 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003856 PyErr_BadArgument();
3857 return -1;
3858 }
Victor Stinner07621332012-06-16 04:53:46 +02003859 if (PyUnicode_READY(unicode) == -1)
3860 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 return PyUnicode_GET_LENGTH(unicode);
3862}
3863
3864Py_UCS4
3865PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3866{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003867 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3868 PyErr_BadArgument();
3869 return (Py_UCS4)-1;
3870 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003871 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003872 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 return (Py_UCS4)-1;
3874 }
3875 return PyUnicode_READ_CHAR(unicode, index);
3876}
3877
3878int
3879PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3880{
3881 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003882 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003883 return -1;
3884 }
Victor Stinner488fa492011-12-12 00:01:39 +01003885 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003886 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003887 PyErr_SetString(PyExc_IndexError, "string index out of range");
3888 return -1;
3889 }
Victor Stinner488fa492011-12-12 00:01:39 +01003890 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003891 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003892 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3893 PyErr_SetString(PyExc_ValueError, "character out of range");
3894 return -1;
3895 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3897 index, ch);
3898 return 0;
3899}
3900
/* Return the name of the default encoding, which is the constant "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3906
Victor Stinner554f3f02010-06-16 23:33:54 +00003907/* create or adjust a UnicodeDecodeError */
3908static void
3909make_decode_exception(PyObject **exceptionObject,
3910 const char *encoding,
3911 const char *input, Py_ssize_t length,
3912 Py_ssize_t startpos, Py_ssize_t endpos,
3913 const char *reason)
3914{
3915 if (*exceptionObject == NULL) {
3916 *exceptionObject = PyUnicodeDecodeError_Create(
3917 encoding, input, length, startpos, endpos, reason);
3918 }
3919 else {
3920 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3921 goto onError;
3922 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3923 goto onError;
3924 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3925 goto onError;
3926 }
3927 return;
3928
3929onError:
3930 Py_DECREF(*exceptionObject);
3931 *exceptionObject = NULL;
3932}
3933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003934/* error handling callback helper:
3935 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003936 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003937 and adjust various state variables.
3938 return 0 on success, -1 on error
3939*/
3940
Alexander Belopolsky40018472011-02-26 01:02:56 +00003941static int
3942unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003943 const char *encoding, const char *reason,
3944 const char **input, const char **inend, Py_ssize_t *startinpos,
3945 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003946 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003947{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003948 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003949
3950 PyObject *restuple = NULL;
3951 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003952 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003953 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003954 Py_ssize_t requiredsize;
3955 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003956 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 int res = -1;
3958
Victor Stinner596a6c42011-11-09 00:02:18 +01003959 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3960 outsize = PyUnicode_GET_LENGTH(*output);
3961 else
3962 outsize = _PyUnicode_WSTR_LENGTH(*output);
3963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003965 *errorHandler = PyCodec_LookupError(errors);
3966 if (*errorHandler == NULL)
3967 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003968 }
3969
Victor Stinner554f3f02010-06-16 23:33:54 +00003970 make_decode_exception(exceptionObject,
3971 encoding,
3972 *input, *inend - *input,
3973 *startinpos, *endinpos,
3974 reason);
3975 if (*exceptionObject == NULL)
3976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977
3978 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3979 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003982 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003983 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003984 }
3985 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003986 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05003987 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003988 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003989
3990 /* Copy back the bytes variables, which might have been modified by the
3991 callback */
3992 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3993 if (!inputobj)
3994 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003995 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003996 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003997 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003998 *input = PyBytes_AS_STRING(inputobj);
3999 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004000 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004001 /* we can DECREF safely, as the exception has another reference,
4002 so the object won't go away. */
4003 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004005 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004007 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004008 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4009 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004010 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004011
Victor Stinner596a6c42011-11-09 00:02:18 +01004012 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4013 /* need more space? (at least enough for what we
4014 have+the replacement+the rest of the string (starting
4015 at the new input position), so we won't have to check space
4016 when there are no errors in the rest of the string) */
4017 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4018 requiredsize = *outpos + replen + insize-newpos;
4019 if (requiredsize > outsize) {
4020 if (requiredsize<2*outsize)
4021 requiredsize = 2*outsize;
4022 if (unicode_resize(output, requiredsize) < 0)
4023 goto onError;
4024 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004025 if (unicode_widen(output, *outpos,
4026 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004027 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004028 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004029 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004030 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004031 else {
4032 wchar_t *repwstr;
4033 Py_ssize_t repwlen;
4034 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4035 if (repwstr == NULL)
4036 goto onError;
4037 /* need more space? (at least enough for what we
4038 have+the replacement+the rest of the string (starting
4039 at the new input position), so we won't have to check space
4040 when there are no errors in the rest of the string) */
4041 requiredsize = *outpos + repwlen + insize-newpos;
4042 if (requiredsize > outsize) {
4043 if (requiredsize < 2*outsize)
4044 requiredsize = 2*outsize;
4045 if (unicode_resize(output, requiredsize) < 0)
4046 goto onError;
4047 }
4048 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4049 *outpos += repwlen;
4050 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004051 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004052 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004053
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004054 /* we made it! */
4055 res = 0;
4056
Benjamin Peterson29060642009-01-31 22:14:21 +00004057 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004058 Py_XDECREF(restuple);
4059 return res;
4060}
4061
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used by UTF-7.  They are plain
   macros and may evaluate their argument several times, so only pass
   expressions without side effects. */

/* Non-zero if c is one of A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))

/* Map a base-64 character to its 6-bit value; c must satisfy IS_BASE64
   (anything not matched by an earlier arm yields 63, the value of '/'). */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: a byte found in UTF-7 input outside a shift sequence that
 * stands for itself.  Decoding is permissive: every ASCII byte does,
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4096
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Lookup table indexed by ASCII code (0..127); it is only read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked before it is used as a table index; NUL is never
 * direct.  directO and directWS are parenthesized in the expansion so that
 * compound arguments such as `a || b` keep their meaning.  c may be
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004142
Alexander Belopolsky40018472011-02-26 01:02:56 +00004143PyObject *
4144PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004145 Py_ssize_t size,
4146 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004147{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004148 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4149}
4150
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  input bytes.
 * errors:   error handler name, passed to unicode_decode_call_errorhandler().
 * consumed: if not NULL, receives the number of input bytes accounted for;
 *           an unfinished shift sequence at the end of the input is then
 *           not an error but is left unconsumed.  If NULL, the input is
 *           treated as complete.
 *
 * Returns the decoded string, or NULL with an exception set. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;          /* input offset of the last '+' or bad byte */
    Py_ssize_t endinpos;
    Py_ssize_t outpos;              /* next write position in `unicode` */
    const char *e;
    PyObject *unicode;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;       /* value of outpos when the section began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    unicode = PyUnicode_New(size, 127);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return unicode;
    }

    shiftOutStart = outpos = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by `goto restart` from the end-of-input check below,
           when an error handler moved the read position back into the
           input. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* No low surrogate followed: emit the pending
                               high surrogate on its own, then handle outCh
                               below. */
                            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (unicode_putchar(&unicode, &outpos, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* Flush a high surrogate that never got its partner. */
                    if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (unicode_putchar(&unicode, &outpos, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* Remember where the sequence starts: used for error reporting
               and for backing off in stateful mode. */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (unicode_putchar(&unicode, &outpos, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = outpos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            if (unicode_putchar(&unicode, &outpos, ch) < 0)
                goto onError;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Reached only by goto.  The handler may update starts, e, s,
           unicode and outpos through the pointers passed to it. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos))
                goto onError;
            /* The handler may have repositioned s inside the input;
               if so, resume decoding from there. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: drop what it produced and report
               only the input before its '+' as consumed. */
            outpos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the result to the number of characters actually written. */
    if (unicode_resize(&unicode, outpos) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return unicode_result(unicode);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
4341
4342
Alexander Belopolsky40018472011-02-26 01:02:56 +00004343PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004344_PyUnicode_EncodeUTF7(PyObject *str,
4345 int base64SetO,
4346 int base64WhiteSpace,
4347 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004349 int kind;
4350 void *data;
4351 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004352 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004353 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004354 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 unsigned int base64bits = 0;
4356 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357 char * out;
4358 char * start;
4359
Benjamin Petersonbac79492012-01-14 13:34:47 -05004360 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004361 return NULL;
4362 kind = PyUnicode_KIND(str);
4363 data = PyUnicode_DATA(str);
4364 len = PyUnicode_GET_LENGTH(str);
4365
4366 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004369 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004370 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004371 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004372 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004373 if (v == NULL)
4374 return NULL;
4375
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004376 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004377 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004378 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 if (inShift) {
4381 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4382 /* shifting out */
4383 if (base64bits) { /* output remaining bits */
4384 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4385 base64buffer = 0;
4386 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
4388 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004389 /* Characters not in the BASE64 set implicitly unshift the sequence
4390 so no '-' is required, except if the character is itself a '-' */
4391 if (IS_BASE64(ch) || ch == '-') {
4392 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004394 *out++ = (char) ch;
4395 }
4396 else {
4397 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004398 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else { /* not in a shift sequence */
4401 if (ch == '+') {
4402 *out++ = '+';
4403 *out++ = '-';
4404 }
4405 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4406 *out++ = (char) ch;
4407 }
4408 else {
4409 *out++ = '+';
4410 inShift = 1;
4411 goto encode_char;
4412 }
4413 }
4414 continue;
4415encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004416 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004417 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004418
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 /* code first surrogate */
4420 base64bits += 16;
4421 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4422 while (base64bits >= 6) {
4423 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4424 base64bits -= 6;
4425 }
4426 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004427 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004429 base64bits += 16;
4430 base64buffer = (base64buffer << 16) | ch;
4431 while (base64bits >= 6) {
4432 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4433 base64bits -= 6;
4434 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 if (base64bits)
4437 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4438 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004440 if (_PyBytes_Resize(&v, out - start) < 0)
4441 return NULL;
4442 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004444PyObject *
4445PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4446 Py_ssize_t size,
4447 int base64SetO,
4448 int base64WhiteSpace,
4449 const char *errors)
4450{
4451 PyObject *result;
4452 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4453 if (tmp == NULL)
4454 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004455 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004456 base64WhiteSpace, errors);
4457 Py_DECREF(tmp);
4458 return result;
4459}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461#undef IS_BASE64
4462#undef FROM_BASE64
4463#undef TO_BASE64
4464#undef DECODE_DIRECT
4465#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467/* --- UTF-8 Codec -------------------------------------------------------- */
4468
Alexander Belopolsky40018472011-02-26 01:02:56 +00004469PyObject *
4470PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004471 Py_ssize_t size,
4472 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004473{
Walter Dörwald69652032004-09-07 20:24:22 +00004474 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4475}
4476
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004477#include "stringlib/asciilib.h"
4478#include "stringlib/codecs.h"
4479#include "stringlib/undef.h"
4480
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004481#include "stringlib/ucs1lib.h"
4482#include "stringlib/codecs.h"
4483#include "stringlib/undef.h"
4484
4485#include "stringlib/ucs2lib.h"
4486#include "stringlib/codecs.h"
4487#include "stringlib/undef.h"
4488
4489#include "stringlib/ucs4lib.h"
4490#include "stringlib/codecs.h"
4491#include "stringlib/undef.h"
4492
Antoine Pitrouab868312009-01-10 15:40:25 +00004493/* Mask to quickly check whether a C 'long' contains a
4494 non-ASCII, UTF8-encoded char. */
4495#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004496# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004497#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004498# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004499#else
4500# error C 'long' size should be either 4 or 8!
4501#endif
4502
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004503static Py_ssize_t
4504ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004505{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004506 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004507 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004508
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004509#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004510 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4511 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004512 /* Fast path, see in STRINGLIB(utf8_decode) for
4513 an explanation. */
4514 /* Help register allocation */
4515 register const char *_p = p;
4516 register Py_UCS1 * q = dest;
4517 while (_p < aligned_end) {
4518 unsigned long value = *(const unsigned long *) _p;
4519 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004521 *((unsigned long *)q) = value;
4522 _p += SIZEOF_LONG;
4523 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004524 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004525 p = _p;
4526 while (p < end) {
4527 if ((unsigned char)*p & 0x80)
4528 break;
4529 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004531 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004533#endif
4534 while (p < end) {
4535 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4536 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004537 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004538 /* Help register allocation */
4539 register const char *_p = p;
4540 while (_p < aligned_end) {
4541 unsigned long value = *(unsigned long *) _p;
4542 if (value & ASCII_CHAR_MASK)
4543 break;
4544 _p += SIZEOF_LONG;
4545 }
4546 p = _p;
4547 if (_p == end)
4548 break;
4549 }
4550 if ((unsigned char)*p & 0x80)
4551 break;
4552 ++p;
4553 }
4554 memcpy(dest, start, p - start);
4555 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556}
Antoine Pitrouab868312009-01-10 15:40:25 +00004557
Victor Stinner785938e2011-12-11 20:09:03 +01004558PyObject *
4559PyUnicode_DecodeUTF8Stateful(const char *s,
4560 Py_ssize_t size,
4561 const char *errors,
4562 Py_ssize_t *consumed)
4563{
Victor Stinner785938e2011-12-11 20:09:03 +01004564 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004565 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004566 const char *end = s + size;
4567 Py_ssize_t outpos;
4568
4569 Py_ssize_t startinpos;
4570 Py_ssize_t endinpos;
4571 const char *errmsg = "";
4572 PyObject *errorHandler = NULL;
4573 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004574
4575 if (size == 0) {
4576 if (consumed)
4577 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004578 Py_INCREF(unicode_empty);
4579 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004580 }
4581
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004582 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4583 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004584 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004585 *consumed = 1;
4586 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004587 }
4588
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004589 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004590 if (!unicode)
4591 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004592
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004593 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4594 s += outpos;
4595 while (s < end) {
4596 Py_UCS4 ch;
4597 int kind = PyUnicode_KIND(unicode);
4598 if (kind == PyUnicode_1BYTE_KIND) {
4599 if (PyUnicode_IS_ASCII(unicode))
4600 ch = asciilib_utf8_decode(&s, end,
4601 PyUnicode_1BYTE_DATA(unicode), &outpos);
4602 else
4603 ch = ucs1lib_utf8_decode(&s, end,
4604 PyUnicode_1BYTE_DATA(unicode), &outpos);
4605 } else if (kind == PyUnicode_2BYTE_KIND) {
4606 ch = ucs2lib_utf8_decode(&s, end,
4607 PyUnicode_2BYTE_DATA(unicode), &outpos);
4608 } else {
4609 assert(kind == PyUnicode_4BYTE_KIND);
4610 ch = ucs4lib_utf8_decode(&s, end,
4611 PyUnicode_4BYTE_DATA(unicode), &outpos);
4612 }
4613
4614 switch (ch) {
4615 case 0:
4616 if (s == end || consumed)
4617 goto End;
4618 errmsg = "unexpected end of data";
4619 startinpos = s - starts;
4620 endinpos = startinpos + 1;
4621 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4622 endinpos++;
4623 break;
4624 case 1:
4625 errmsg = "invalid start byte";
4626 startinpos = s - starts;
4627 endinpos = startinpos + 1;
4628 break;
4629 case 2:
4630 errmsg = "invalid continuation byte";
4631 startinpos = s - starts;
4632 endinpos = startinpos + 1;
4633 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4634 endinpos++;
4635 break;
4636 default:
4637 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4638 goto onError;
4639 continue;
4640 }
4641
4642 if (unicode_decode_call_errorhandler(
4643 errors, &errorHandler,
4644 "utf-8", errmsg,
4645 &starts, &end, &startinpos, &endinpos, &exc, &s,
4646 &unicode, &outpos))
4647 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004648 }
4649
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004650End:
4651 if (unicode_resize(&unicode, outpos) < 0)
4652 goto onError;
4653
4654 if (consumed)
4655 *consumed = s - starts;
4656
4657 Py_XDECREF(errorHandler);
4658 Py_XDECREF(exc);
4659 assert(_PyUnicode_CheckConsistency(unicode, 1));
4660 return unicode;
4661
4662onError:
4663 Py_XDECREF(errorHandler);
4664 Py_XDECREF(exc);
4665 Py_XDECREF(unicode);
4666 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004667}
4668
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004669#ifdef __APPLE__
4670
4671/* Simplified UTF-8 decoder using surrogateescape error handler,
4672 used to decode the command line arguments on Mac OS X. */
4673
4674wchar_t*
4675_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4676{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004677 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004678 wchar_t *unicode;
4679 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004680
4681 /* Note: size will always be longer than the resulting Unicode
4682 character count */
4683 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4684 PyErr_NoMemory();
4685 return NULL;
4686 }
4687 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4688 if (!unicode)
4689 return NULL;
4690
4691 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004692 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004694 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004695 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004696#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004697 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004698#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004700#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004701 if (ch > 0xFF) {
4702#if SIZEOF_WCHAR_T == 4
4703 assert(0);
4704#else
4705 assert(Py_UNICODE_IS_SURROGATE(ch));
4706 /* compute and append the two surrogates: */
4707 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4708 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4709#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004710 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004711 else {
4712 if (!ch && s == e)
4713 break;
4714 /* surrogateescape */
4715 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4716 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004717 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004719 return unicode;
4720}
4721
4722#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004724/* Primary internal function which creates utf8 encoded bytes objects.
4725
4726 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004727 and allocate exactly as much space needed at the end. Else allocate the
4728 maximum possible needed (4 result bytes per Unicode character), and return
4729 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004730*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004731PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004732_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733{
Victor Stinner6099a032011-12-18 14:22:26 +01004734 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004735 void *data;
4736 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004738 if (!PyUnicode_Check(unicode)) {
4739 PyErr_BadArgument();
4740 return NULL;
4741 }
4742
4743 if (PyUnicode_READY(unicode) == -1)
4744 return NULL;
4745
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004746 if (PyUnicode_UTF8(unicode))
4747 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4748 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004749
4750 kind = PyUnicode_KIND(unicode);
4751 data = PyUnicode_DATA(unicode);
4752 size = PyUnicode_GET_LENGTH(unicode);
4753
Benjamin Petersonead6b532011-12-20 17:23:42 -06004754 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004755 default:
4756 assert(0);
4757 case PyUnicode_1BYTE_KIND:
4758 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4759 assert(!PyUnicode_IS_ASCII(unicode));
4760 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4761 case PyUnicode_2BYTE_KIND:
4762 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4763 case PyUnicode_4BYTE_KIND:
4764 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766}
4767
Alexander Belopolsky40018472011-02-26 01:02:56 +00004768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004769PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4770 Py_ssize_t size,
4771 const char *errors)
4772{
4773 PyObject *v, *unicode;
4774
4775 unicode = PyUnicode_FromUnicode(s, size);
4776 if (unicode == NULL)
4777 return NULL;
4778 v = _PyUnicode_AsUTF8String(unicode, errors);
4779 Py_DECREF(unicode);
4780 return v;
4781}
4782
4783PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004784PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004786 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787}
4788
Walter Dörwald41980ca2007-08-16 21:55:45 +00004789/* --- UTF-32 Codec ------------------------------------------------------- */
4790
4791PyObject *
4792PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004793 Py_ssize_t size,
4794 const char *errors,
4795 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004796{
4797 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4798}
4799
4800PyObject *
4801PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004802 Py_ssize_t size,
4803 const char *errors,
4804 int *byteorder,
4805 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004806{
4807 const char *starts = s;
4808 Py_ssize_t startinpos;
4809 Py_ssize_t endinpos;
4810 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004811 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004812 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004813 int bo = 0; /* assume native ordering by default */
4814 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004815 /* Offsets from q for retrieving bytes in the right order. */
4816#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4817 int iorder[] = {0, 1, 2, 3};
4818#else
4819 int iorder[] = {3, 2, 1, 0};
4820#endif
4821 PyObject *errorHandler = NULL;
4822 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004823
Walter Dörwald41980ca2007-08-16 21:55:45 +00004824 q = (unsigned char *)s;
4825 e = q + size;
4826
4827 if (byteorder)
4828 bo = *byteorder;
4829
4830 /* Check for BOM marks (U+FEFF) in the input and adjust current
4831 byte order setting accordingly. In native mode, the leading BOM
4832 mark is skipped, in all other modes, it is copied to the output
4833 stream as-is (giving a ZWNBSP character). */
4834 if (bo == 0) {
4835 if (size >= 4) {
4836 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004837 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004838#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004839 if (bom == 0x0000FEFF) {
4840 q += 4;
4841 bo = -1;
4842 }
4843 else if (bom == 0xFFFE0000) {
4844 q += 4;
4845 bo = 1;
4846 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004847#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004848 if (bom == 0x0000FEFF) {
4849 q += 4;
4850 bo = 1;
4851 }
4852 else if (bom == 0xFFFE0000) {
4853 q += 4;
4854 bo = -1;
4855 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004856#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004857 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004858 }
4859
4860 if (bo == -1) {
4861 /* force LE */
4862 iorder[0] = 0;
4863 iorder[1] = 1;
4864 iorder[2] = 2;
4865 iorder[3] = 3;
4866 }
4867 else if (bo == 1) {
4868 /* force BE */
4869 iorder[0] = 3;
4870 iorder[1] = 2;
4871 iorder[2] = 1;
4872 iorder[3] = 0;
4873 }
4874
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004875 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004876 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004877 if (!unicode)
4878 return NULL;
4879 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004880 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004881 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004882
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 Py_UCS4 ch;
4885 /* remaining bytes at the end? (size should be divisible by 4) */
4886 if (e-q<4) {
4887 if (consumed)
4888 break;
4889 errmsg = "truncated data";
4890 startinpos = ((const char *)q)-starts;
4891 endinpos = ((const char *)e)-starts;
4892 goto utf32Error;
4893 /* The remaining input chars are ignored if the callback
4894 chooses to skip the input */
4895 }
4896 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4897 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004898
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 if (ch >= 0x110000)
4900 {
4901 errmsg = "codepoint not in range(0x110000)";
4902 startinpos = ((const char *)q)-starts;
4903 endinpos = startinpos+4;
4904 goto utf32Error;
4905 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004906 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4907 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 q += 4;
4909 continue;
4910 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 if (unicode_decode_call_errorhandler(
4912 errors, &errorHandler,
4913 "utf32", errmsg,
4914 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004915 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004917 }
4918
4919 if (byteorder)
4920 *byteorder = bo;
4921
4922 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004923 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924
4925 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01004926 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927 goto onError;
4928
4929 Py_XDECREF(errorHandler);
4930 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004931 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004932
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934 Py_DECREF(unicode);
4935 Py_XDECREF(errorHandler);
4936 Py_XDECREF(exc);
4937 return NULL;
4938}
4939
4940PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004941_PyUnicode_EncodeUTF32(PyObject *str,
4942 const char *errors,
4943 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004945 int kind;
4946 void *data;
4947 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004948 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004950 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004951 /* Offsets from p for storing byte pairs in the right order. */
4952#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4953 int iorder[] = {0, 1, 2, 3};
4954#else
4955 int iorder[] = {3, 2, 1, 0};
4956#endif
4957
Benjamin Peterson29060642009-01-31 22:14:21 +00004958#define STORECHAR(CH) \
4959 do { \
4960 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4961 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4962 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4963 p[iorder[0]] = (CH) & 0xff; \
4964 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965 } while(0)
4966
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004967 if (!PyUnicode_Check(str)) {
4968 PyErr_BadArgument();
4969 return NULL;
4970 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004971 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004972 return NULL;
4973 kind = PyUnicode_KIND(str);
4974 data = PyUnicode_DATA(str);
4975 len = PyUnicode_GET_LENGTH(str);
4976
4977 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004978 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00004979 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004980 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004981 if (v == NULL)
4982 return NULL;
4983
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004984 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004987 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004988 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989
4990 if (byteorder == -1) {
4991 /* force LE */
4992 iorder[0] = 0;
4993 iorder[1] = 1;
4994 iorder[2] = 2;
4995 iorder[3] = 3;
4996 }
4997 else if (byteorder == 1) {
4998 /* force BE */
4999 iorder[0] = 3;
5000 iorder[1] = 2;
5001 iorder[2] = 1;
5002 iorder[3] = 0;
5003 }
5004
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005005 for (i = 0; i < len; i++)
5006 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005007
5008 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005009 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005010#undef STORECHAR
5011}
5012
Alexander Belopolsky40018472011-02-26 01:02:56 +00005013PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005014PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5015 Py_ssize_t size,
5016 const char *errors,
5017 int byteorder)
5018{
5019 PyObject *result;
5020 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5021 if (tmp == NULL)
5022 return NULL;
5023 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5024 Py_DECREF(tmp);
5025 return result;
5026}
5027
5028PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005029PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005030{
Victor Stinnerb960b342011-11-20 19:12:52 +01005031 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032}
5033
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034/* --- UTF-16 Codec ------------------------------------------------------- */
5035
Tim Peters772747b2001-08-09 22:21:55 +00005036PyObject *
5037PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 Py_ssize_t size,
5039 const char *errors,
5040 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041{
Walter Dörwald69652032004-09-07 20:24:22 +00005042 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5043}
5044
5045PyObject *
5046PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 Py_ssize_t size,
5048 const char *errors,
5049 int *byteorder,
5050 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005051{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005052 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005053 Py_ssize_t startinpos;
5054 Py_ssize_t endinpos;
5055 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005056 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005057 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005058 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005059 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005060 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005061 PyObject *errorHandler = NULL;
5062 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063
Tim Peters772747b2001-08-09 22:21:55 +00005064 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005065 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066
5067 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005068 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005070 /* Check for BOM marks (U+FEFF) in the input and adjust current
5071 byte order setting accordingly. In native mode, the leading BOM
5072 mark is skipped, in all other modes, it is copied to the output
5073 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005074 if (bo == 0 && size >= 2) {
5075 const Py_UCS4 bom = (q[1] << 8) | q[0];
5076 if (bom == 0xFEFF) {
5077 q += 2;
5078 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005079 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005080 else if (bom == 0xFFFE) {
5081 q += 2;
5082 bo = 1;
5083 }
5084 if (byteorder)
5085 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087
Antoine Pitrou63065d72012-05-15 23:48:04 +02005088 if (q == e) {
5089 if (consumed)
5090 *consumed = size;
5091 Py_INCREF(unicode_empty);
5092 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005093 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005094
Antoine Pitrouab868312009-01-10 15:40:25 +00005095#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005096 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005097#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005098 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005099#endif
Tim Peters772747b2001-08-09 22:21:55 +00005100
Antoine Pitrou63065d72012-05-15 23:48:04 +02005101 /* Note: size will always be longer than the resulting Unicode
5102 character count */
5103 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5104 if (!unicode)
5105 return NULL;
5106
5107 outpos = 0;
5108 while (1) {
5109 Py_UCS4 ch = 0;
5110 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005111 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005112 if (kind == PyUnicode_1BYTE_KIND) {
5113 if (PyUnicode_IS_ASCII(unicode))
5114 ch = asciilib_utf16_decode(&q, e,
5115 PyUnicode_1BYTE_DATA(unicode), &outpos,
5116 native_ordering);
5117 else
5118 ch = ucs1lib_utf16_decode(&q, e,
5119 PyUnicode_1BYTE_DATA(unicode), &outpos,
5120 native_ordering);
5121 } else if (kind == PyUnicode_2BYTE_KIND) {
5122 ch = ucs2lib_utf16_decode(&q, e,
5123 PyUnicode_2BYTE_DATA(unicode), &outpos,
5124 native_ordering);
5125 } else {
5126 assert(kind == PyUnicode_4BYTE_KIND);
5127 ch = ucs4lib_utf16_decode(&q, e,
5128 PyUnicode_4BYTE_DATA(unicode), &outpos,
5129 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005130 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005131 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005132
Antoine Pitrou63065d72012-05-15 23:48:04 +02005133 switch (ch)
5134 {
5135 case 0:
5136 /* remaining byte at the end? (size should be even) */
5137 if (q == e || consumed)
5138 goto End;
5139 errmsg = "truncated data";
5140 startinpos = ((const char *)q) - starts;
5141 endinpos = ((const char *)e) - starts;
5142 break;
5143 /* The remaining input chars are ignored if the callback
5144 chooses to skip the input */
5145 case 1:
5146 errmsg = "unexpected end of data";
5147 startinpos = ((const char *)q) - 2 - starts;
5148 endinpos = ((const char *)e) - starts;
5149 break;
5150 case 2:
5151 errmsg = "illegal encoding";
5152 startinpos = ((const char *)q) - 2 - starts;
5153 endinpos = startinpos + 2;
5154 break;
5155 case 3:
5156 errmsg = "illegal UTF-16 surrogate";
5157 startinpos = ((const char *)q) - 4 - starts;
5158 endinpos = startinpos + 2;
5159 break;
5160 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005161 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5162 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005163 continue;
5164 }
5165
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005167 errors,
5168 &errorHandler,
5169 "utf16", errmsg,
5170 &starts,
5171 (const char **)&e,
5172 &startinpos,
5173 &endinpos,
5174 &exc,
5175 (const char **)&q,
5176 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005177 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 }
5180
Antoine Pitrou63065d72012-05-15 23:48:04 +02005181End:
Walter Dörwald69652032004-09-07 20:24:22 +00005182 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005183 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005184
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005186 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187 goto onError;
5188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189 Py_XDECREF(errorHandler);
5190 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005191 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005195 Py_XDECREF(errorHandler);
5196 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 return NULL;
5198}
5199
Tim Peters772747b2001-08-09 22:21:55 +00005200PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005201_PyUnicode_EncodeUTF16(PyObject *str,
5202 const char *errors,
5203 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005205 enum PyUnicode_Kind kind;
5206 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005207 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005208 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005209 unsigned short *out;
5210 Py_ssize_t bytesize;
5211 Py_ssize_t pairs;
5212#ifdef WORDS_BIGENDIAN
5213 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005214#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005215 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005216#endif
5217
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005218 if (!PyUnicode_Check(str)) {
5219 PyErr_BadArgument();
5220 return NULL;
5221 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005222 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005223 return NULL;
5224 kind = PyUnicode_KIND(str);
5225 data = PyUnicode_DATA(str);
5226 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005227
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005228 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005229 if (kind == PyUnicode_4BYTE_KIND) {
5230 const Py_UCS4 *in = (const Py_UCS4 *)data;
5231 const Py_UCS4 *end = in + len;
5232 while (in < end)
5233 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005234 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005235 }
5236 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005238 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005239 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 if (v == NULL)
5241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005243 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005244 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005245 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005246 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005247 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005248 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005249 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005250
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005251 switch (kind) {
5252 case PyUnicode_1BYTE_KIND: {
5253 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5254 break;
Tim Peters772747b2001-08-09 22:21:55 +00005255 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005256 case PyUnicode_2BYTE_KIND: {
5257 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5258 break;
Tim Peters772747b2001-08-09 22:21:55 +00005259 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005260 case PyUnicode_4BYTE_KIND: {
5261 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5262 break;
5263 }
5264 default:
5265 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005266 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005267
5268 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005269 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270}
5271
Alexander Belopolsky40018472011-02-26 01:02:56 +00005272PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005273PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5274 Py_ssize_t size,
5275 const char *errors,
5276 int byteorder)
5277{
5278 PyObject *result;
5279 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5280 if (tmp == NULL)
5281 return NULL;
5282 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5283 Py_DECREF(tmp);
5284 return result;
5285}
5286
5287PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005288PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005290 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291}
5292
5293/* --- Unicode Escape Codec ----------------------------------------------- */
5294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005295/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5296 if all the escapes in the string make it still a valid ASCII string.
5297 Returns -1 if any escapes were found which cause the string to
5298 pop out of ASCII range. Otherwise returns the length of the
5299 required buffer to hold the string.
5300 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005301static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005302length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5303{
5304 const unsigned char *p = (const unsigned char *)s;
5305 const unsigned char *end = p + size;
5306 Py_ssize_t length = 0;
5307
5308 if (size < 0)
5309 return -1;
5310
5311 for (; p < end; ++p) {
5312 if (*p > 127) {
5313 /* Non-ASCII */
5314 return -1;
5315 }
5316 else if (*p != '\\') {
5317 /* Normal character */
5318 ++length;
5319 }
5320 else {
5321 /* Backslash-escape, check next char */
5322 ++p;
5323 /* Escape sequence reaches till end of string or
5324 non-ASCII follow-up. */
5325 if (p >= end || *p > 127)
5326 return -1;
5327 switch (*p) {
5328 case '\n':
5329 /* backslash + \n result in zero characters */
5330 break;
5331 case '\\': case '\'': case '\"':
5332 case 'b': case 'f': case 't':
5333 case 'n': case 'r': case 'v': case 'a':
5334 ++length;
5335 break;
5336 case '0': case '1': case '2': case '3':
5337 case '4': case '5': case '6': case '7':
5338 case 'x': case 'u': case 'U': case 'N':
5339 /* these do not guarantee ASCII characters */
5340 return -1;
5341 default:
5342 /* count the backslash + the other character */
5343 length += 2;
5344 }
5345 }
5346 }
5347 return length;
5348}
5349
Fredrik Lundh06d12682001-01-24 07:59:11 +00005350static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005351
Alexander Belopolsky40018472011-02-26 01:02:56 +00005352PyObject *
5353PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005354 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005355 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005356{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005357 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005358 Py_ssize_t startinpos;
5359 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005360 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005361 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005363 char* message;
5364 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005365 PyObject *errorHandler = NULL;
5366 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005367 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005368 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005369
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005370 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371
5372 /* After length_of_escaped_ascii_string() there are two alternatives,
5373 either the string is pure ASCII with named escapes like \n, etc.
5374 and we determined it's exact size (common case)
5375 or it contains \x, \u, ... escape sequences. then we create a
5376 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005377 if (len >= 0) {
5378 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005379 if (!v)
5380 goto onError;
5381 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005382 }
5383 else {
5384 /* Escaped strings will always be longer than the resulting
5385 Unicode string, so we start with size here and then reduce the
5386 length after conversion to the true value.
5387 (but if the error callback returns a long replacement string
5388 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005389 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005390 if (!v)
5391 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005392 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005393 }
5394
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005396 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005398 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 while (s < end) {
5401 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005402 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005403 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005405 /* The only case in which i == ascii_length is a backslash
5406 followed by a newline. */
5407 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005408
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 /* Non-escape characters are interpreted as Unicode ordinals */
5410 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005411 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5412 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 continue;
5414 }
5415
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005416 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 /* \ - Escapes */
5418 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005419 c = *s++;
5420 if (s > end)
5421 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005422
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005423 /* The only case in which i == ascii_length is a backslash
5424 followed by a newline. */
5425 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005426
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005427 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005430#define WRITECHAR(ch) \
5431 do { \
5432 if (unicode_putchar(&v, &i, ch) < 0) \
5433 goto onError; \
5434 }while(0)
5435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005437 case '\\': WRITECHAR('\\'); break;
5438 case '\'': WRITECHAR('\''); break;
5439 case '\"': WRITECHAR('\"'); break;
5440 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005442 case 'f': WRITECHAR('\014'); break;
5443 case 't': WRITECHAR('\t'); break;
5444 case 'n': WRITECHAR('\n'); break;
5445 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005446 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005447 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005448 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005449 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 case '0': case '1': case '2': case '3':
5453 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005454 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005455 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005456 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005457 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005458 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005460 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 break;
5462
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 /* hex escapes */
5464 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005466 digits = 2;
5467 message = "truncated \\xXX escape";
5468 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469
Benjamin Peterson29060642009-01-31 22:14:21 +00005470 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005472 digits = 4;
5473 message = "truncated \\uXXXX escape";
5474 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475
Benjamin Peterson29060642009-01-31 22:14:21 +00005476 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005477 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005478 digits = 8;
5479 message = "truncated \\UXXXXXXXX escape";
5480 hexescape:
5481 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005482 if (s+digits>end) {
5483 endinpos = size;
5484 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005485 errors, &errorHandler,
5486 "unicodeescape", "end of string in escape sequence",
5487 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005488 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005489 goto onError;
5490 goto nextByte;
5491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005492 for (j = 0; j < digits; ++j) {
5493 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005494 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005495 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005496 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 errors, &errorHandler,
5498 "unicodeescape", message,
5499 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005500 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005501 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005502 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005503 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005504 }
5505 chr = (chr<<4) & ~0xF;
5506 if (c >= '0' && c <= '9')
5507 chr += c - '0';
5508 else if (c >= 'a' && c <= 'f')
5509 chr += 10 + c - 'a';
5510 else
5511 chr += 10 + c - 'A';
5512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005513 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005514 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 /* _decoding_error will have already written into the
5516 target buffer. */
5517 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005518 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005519 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005520 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005521 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005522 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005523 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005524 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 errors, &errorHandler,
5526 "unicodeescape", "illegal Unicode character",
5527 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005528 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005529 goto onError;
5530 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005531 break;
5532
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005534 case 'N':
5535 message = "malformed \\N character escape";
5536 if (ucnhash_CAPI == NULL) {
5537 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005538 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5539 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005540 if (ucnhash_CAPI == NULL)
5541 goto ucnhashError;
5542 }
5543 if (*s == '{') {
5544 const char *start = s+1;
5545 /* look for the closing brace */
5546 while (*s != '}' && s < end)
5547 s++;
5548 if (s > start && s < end && *s == '}') {
5549 /* found a name. look it up in the unicode database */
5550 message = "unknown Unicode character name";
5551 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005552 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005553 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 goto store;
5555 }
5556 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005558 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 errors, &errorHandler,
5560 "unicodeescape", message,
5561 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005562 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005563 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005564 break;
5565
5566 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005567 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005568 message = "\\ at end of string";
5569 s--;
5570 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005572 errors, &errorHandler,
5573 "unicodeescape", message,
5574 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005575 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005576 goto onError;
5577 }
5578 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005579 WRITECHAR('\\');
5580 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005581 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005582 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005583 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005584 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005588
Victor Stinner16e6a802011-12-12 13:24:15 +01005589 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005591 Py_XDECREF(errorHandler);
5592 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005593 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005594
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005596 PyErr_SetString(
5597 PyExc_UnicodeError,
5598 "\\N escapes not supported (can't load unicodedata module)"
5599 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005600 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005601 Py_XDECREF(errorHandler);
5602 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005603 return NULL;
5604
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005607 Py_XDECREF(errorHandler);
5608 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 return NULL;
5610}
5611
5612/* Return a Unicode-Escape string version of the Unicode object.
5613
5614 If quotes is true, the string is enclosed in u"" or u'' quotes as
5615 appropriate.
5616
5617*/
5618
Alexander Belopolsky40018472011-02-26 01:02:56 +00005619PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005620PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005622 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005623 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005625 int kind;
5626 void *data;
5627 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628
Ezio Melottie7f90372012-10-05 03:33:31 +03005629 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005630 escape.
5631
Ezio Melottie7f90372012-10-05 03:33:31 +03005632 For UCS1 strings it's '\xxx', 4 bytes per source character.
5633 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5634 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005635 */
5636
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 if (!PyUnicode_Check(unicode)) {
5638 PyErr_BadArgument();
5639 return NULL;
5640 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005641 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005642 return NULL;
5643 len = PyUnicode_GET_LENGTH(unicode);
5644 kind = PyUnicode_KIND(unicode);
5645 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005646 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005647 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5648 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5649 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5650 }
5651
5652 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005653 return PyBytes_FromStringAndSize(NULL, 0);
5654
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005656 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005657
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005658 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005659 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005661 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 if (repr == NULL)
5663 return NULL;
5664
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005665 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005667 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005668 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005669
Walter Dörwald79e913e2007-05-12 11:08:06 +00005670 /* Escape backslashes */
5671 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005672 *p++ = '\\';
5673 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005674 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005675 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005676
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005677 /* Map 21-bit characters to '\U00xxxxxx' */
5678 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005679 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005680 *p++ = '\\';
5681 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005682 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5683 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5684 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5685 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5686 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5687 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5688 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5689 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005690 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005691 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005692
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005694 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 *p++ = '\\';
5696 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005697 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5698 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5699 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5700 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005702
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005703 /* Map special whitespace to '\t', \n', '\r' */
5704 else if (ch == '\t') {
5705 *p++ = '\\';
5706 *p++ = 't';
5707 }
5708 else if (ch == '\n') {
5709 *p++ = '\\';
5710 *p++ = 'n';
5711 }
5712 else if (ch == '\r') {
5713 *p++ = '\\';
5714 *p++ = 'r';
5715 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005716
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005717 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005718 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005720 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005721 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5722 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005723 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005724
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725 /* Copy everything else as-is */
5726 else
5727 *p++ = (char) ch;
5728 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005730 assert(p - PyBytes_AS_STRING(repr) > 0);
5731 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5732 return NULL;
5733 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734}
5735
Alexander Belopolsky40018472011-02-26 01:02:56 +00005736PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005737PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5738 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005740 PyObject *result;
5741 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5742 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005744 result = PyUnicode_AsUnicodeEscapeString(tmp);
5745 Py_DECREF(tmp);
5746 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747}
5748
5749/* --- Raw Unicode Escape Codec ------------------------------------------- */
5750
Alexander Belopolsky40018472011-02-26 01:02:56 +00005751PyObject *
5752PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005753 Py_ssize_t size,
5754 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005757 Py_ssize_t startinpos;
5758 Py_ssize_t endinpos;
5759 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005760 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 const char *end;
5762 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 PyObject *errorHandler = NULL;
5764 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005765
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 /* Escaped strings will always be longer than the resulting
5767 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 length after conversion to the true value. (But decoding error
5769 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005770 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005773 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005774 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005775 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005776 end = s + size;
5777 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 unsigned char c;
5779 Py_UCS4 x;
5780 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005781 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 /* Non-escape characters are interpreted as Unicode ordinals */
5784 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005785 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5786 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005788 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 startinpos = s-starts;
5790
5791 /* \u-escapes are only interpreted iff the number of leading
5792 backslashes if odd */
5793 bs = s;
5794 for (;s < end;) {
5795 if (*s != '\\')
5796 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005797 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5798 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005799 }
5800 if (((s - bs) & 1) == 0 ||
5801 s >= end ||
5802 (*s != 'u' && *s != 'U')) {
5803 continue;
5804 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005805 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 count = *s=='u' ? 4 : 8;
5807 s++;
5808
5809 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 for (x = 0, i = 0; i < count; ++i, ++s) {
5811 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005812 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 endinpos = s-starts;
5814 if (unicode_decode_call_errorhandler(
5815 errors, &errorHandler,
5816 "rawunicodeescape", "truncated \\uXXXX",
5817 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005818 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 goto onError;
5820 goto nextByte;
5821 }
5822 x = (x<<4) & ~0xF;
5823 if (c >= '0' && c <= '9')
5824 x += c - '0';
5825 else if (c >= 'a' && c <= 'f')
5826 x += 10 + c - 'a';
5827 else
5828 x += 10 + c - 'A';
5829 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005830 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831 if (unicode_putchar(&v, &outpos, x) < 0)
5832 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005833 } else {
5834 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005835 if (unicode_decode_call_errorhandler(
5836 errors, &errorHandler,
5837 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005841 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 nextByte:
5843 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005845 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 Py_XDECREF(errorHandler);
5848 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005849 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005850
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 return NULL;
5856}
5857
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005858
Alexander Belopolsky40018472011-02-26 01:02:56 +00005859PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005860PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005862 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 char *p;
5864 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865 Py_ssize_t expandsize, pos;
5866 int kind;
5867 void *data;
5868 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870 if (!PyUnicode_Check(unicode)) {
5871 PyErr_BadArgument();
5872 return NULL;
5873 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005874 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 return NULL;
5876 kind = PyUnicode_KIND(unicode);
5877 data = PyUnicode_DATA(unicode);
5878 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005879 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5880 bytes, and 1 byte characters 4. */
5881 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005882
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005885
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 if (repr == NULL)
5888 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005889 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005890 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005892 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 for (pos = 0; pos < len; pos++) {
5894 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 /* Map 32-bit characters to '\Uxxxxxxxx' */
5896 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005897 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005898 *p++ = '\\';
5899 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005900 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5901 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5902 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5903 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5904 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5905 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5906 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5907 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005908 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005909 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 *p++ = '\\';
5912 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005913 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5914 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5915 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5916 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 /* Copy everything else as-is */
5919 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 *p++ = (char) ch;
5921 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005922
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005923 assert(p > q);
5924 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005925 return NULL;
5926 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927}
5928
Alexander Belopolsky40018472011-02-26 01:02:56 +00005929PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005930PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5931 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 PyObject *result;
5934 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5935 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005936 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005937 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5938 Py_DECREF(tmp);
5939 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940}
5941
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005942/* --- Unicode Internal Codec ------------------------------------------- */
5943
Alexander Belopolsky40018472011-02-26 01:02:56 +00005944PyObject *
5945_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005946 Py_ssize_t size,
5947 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005948{
5949 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005950 Py_ssize_t startinpos;
5951 Py_ssize_t endinpos;
5952 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005953 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005954 const char *end;
5955 const char *reason;
5956 PyObject *errorHandler = NULL;
5957 PyObject *exc = NULL;
5958
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005959 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005960 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005961 1))
5962 return NULL;
5963
Thomas Wouters89f507f2006-12-13 04:49:30 +00005964 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005965 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005966 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005967 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005968 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005969 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005970 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005971 end = s + size;
5972
5973 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005974 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005975 Py_UCS4 ch;
5976 /* We copy the raw representation one byte at a time because the
5977 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005978 ((char *) &uch)[0] = s[0];
5979 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005980#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005981 ((char *) &uch)[2] = s[2];
5982 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005983#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005984 ch = uch;
5985
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005986 /* We have to sanity check the raw data, otherwise doom looms for
5987 some malformed UCS-4 data. */
5988 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00005989#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005990 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00005991#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005992 end-s < Py_UNICODE_SIZE
5993 )
Benjamin Peterson29060642009-01-31 22:14:21 +00005994 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005995 startinpos = s - starts;
5996 if (end-s < Py_UNICODE_SIZE) {
5997 endinpos = end-starts;
5998 reason = "truncated input";
5999 }
6000 else {
6001 endinpos = s - starts + Py_UNICODE_SIZE;
6002 reason = "illegal code point (> 0x10FFFF)";
6003 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006004 if (unicode_decode_call_errorhandler(
6005 errors, &errorHandler,
6006 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006007 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006008 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006009 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006010 continue;
6011 }
6012
6013 s += Py_UNICODE_SIZE;
6014#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006015 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006016 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006017 Py_UNICODE uch2;
6018 ((char *) &uch2)[0] = s[0];
6019 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006020 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006021 {
Victor Stinner551ac952011-11-29 22:58:13 +01006022 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006023 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006024 }
6025 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006026#endif
6027
6028 if (unicode_putchar(&v, &outpos, ch) < 0)
6029 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006030 }
6031
Victor Stinner16e6a802011-12-12 13:24:15 +01006032 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006033 goto onError;
6034 Py_XDECREF(errorHandler);
6035 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006036 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006037
Benjamin Peterson29060642009-01-31 22:14:21 +00006038 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006039 Py_XDECREF(v);
6040 Py_XDECREF(errorHandler);
6041 Py_XDECREF(exc);
6042 return NULL;
6043}
6044
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045/* --- Latin-1 Codec ------------------------------------------------------ */
6046
Alexander Belopolsky40018472011-02-26 01:02:56 +00006047PyObject *
6048PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006049 Py_ssize_t size,
6050 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006053 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054}
6055
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006056/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006057static void
6058make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006059 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006060 PyObject *unicode,
6061 Py_ssize_t startpos, Py_ssize_t endpos,
6062 const char *reason)
6063{
6064 if (*exceptionObject == NULL) {
6065 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006066 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006067 encoding, unicode, startpos, endpos, reason);
6068 }
6069 else {
6070 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6071 goto onError;
6072 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6073 goto onError;
6074 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6075 goto onError;
6076 return;
6077 onError:
6078 Py_DECREF(*exceptionObject);
6079 *exceptionObject = NULL;
6080 }
6081}
6082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006084static void
6085raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006086 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006087 PyObject *unicode,
6088 Py_ssize_t startpos, Py_ssize_t endpos,
6089 const char *reason)
6090{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006091 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006092 encoding, unicode, startpos, endpos, reason);
6093 if (*exceptionObject != NULL)
6094 PyCodec_StrictErrors(*exceptionObject);
6095}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006096
6097/* error handling callback helper:
6098 build arguments, call the callback and check the arguments,
6099 put the result into newpos and return the replacement string, which
6100 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101static PyObject *
6102unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006103 PyObject **errorHandler,
6104 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006105 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006106 Py_ssize_t startpos, Py_ssize_t endpos,
6107 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006108{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006109 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006110 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 PyObject *restuple;
6112 PyObject *resunicode;
6113
6114 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 }
6119
Benjamin Petersonbac79492012-01-14 13:34:47 -05006120 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006121 return NULL;
6122 len = PyUnicode_GET_LENGTH(unicode);
6123
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006124 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006125 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006128
6129 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006133 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006134 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 Py_DECREF(restuple);
6136 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006137 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006138 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 &resunicode, newpos)) {
6140 Py_DECREF(restuple);
6141 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006143 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6144 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6145 Py_DECREF(restuple);
6146 return NULL;
6147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006148 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006149 *newpos = len + *newpos;
6150 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006151 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6152 Py_DECREF(restuple);
6153 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006155 Py_INCREF(resunicode);
6156 Py_DECREF(restuple);
6157 return resunicode;
6158}
6159
Alexander Belopolsky40018472011-02-26 01:02:56 +00006160static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006161unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006162 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006163 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006164{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006165 /* input state */
6166 Py_ssize_t pos=0, size;
6167 int kind;
6168 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006169 /* output object */
6170 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 /* pointer into the output */
6172 char *str;
6173 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006174 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006175 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6176 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006177 PyObject *errorHandler = NULL;
6178 PyObject *exc = NULL;
6179 /* the following variable is used for caching string comparisons
6180 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6181 int known_errorHandler = -1;
6182
Benjamin Petersonbac79492012-01-14 13:34:47 -05006183 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006184 return NULL;
6185 size = PyUnicode_GET_LENGTH(unicode);
6186 kind = PyUnicode_KIND(unicode);
6187 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006188 /* allocate enough for a simple encoding without
6189 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006190 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006191 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006192 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006193 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006194 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006195 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006196 ressize = size;
6197
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006198 while (pos < size) {
6199 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 /* can we encode this? */
6202 if (c<limit) {
6203 /* no overflow check, because we know that the space is enough */
6204 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006205 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006206 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 Py_ssize_t requiredsize;
6209 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006210 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006212 Py_ssize_t collstart = pos;
6213 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006215 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006216 ++collend;
6217 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6218 if (known_errorHandler==-1) {
6219 if ((errors==NULL) || (!strcmp(errors, "strict")))
6220 known_errorHandler = 1;
6221 else if (!strcmp(errors, "replace"))
6222 known_errorHandler = 2;
6223 else if (!strcmp(errors, "ignore"))
6224 known_errorHandler = 3;
6225 else if (!strcmp(errors, "xmlcharrefreplace"))
6226 known_errorHandler = 4;
6227 else
6228 known_errorHandler = 0;
6229 }
6230 switch (known_errorHandler) {
6231 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006232 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006233 goto onError;
6234 case 2: /* replace */
6235 while (collstart++<collend)
6236 *str++ = '?'; /* fall through */
6237 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 break;
6240 case 4: /* xmlcharrefreplace */
6241 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006242 /* determine replacement size */
6243 for (i = collstart, repsize = 0; i < collend; ++i) {
6244 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6245 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006247 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006249 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006250 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006252 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006253 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006255 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006257 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006258 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006259 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006260 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006262 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 if (requiredsize > ressize) {
6264 if (requiredsize<2*ressize)
6265 requiredsize = 2*ressize;
6266 if (_PyBytes_Resize(&res, requiredsize))
6267 goto onError;
6268 str = PyBytes_AS_STRING(res) + respos;
6269 ressize = requiredsize;
6270 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006271 /* generate replacement */
6272 for (i = collstart; i < collend; ++i) {
6273 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 break;
6277 default:
6278 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 encoding, reason, unicode, &exc,
6280 collstart, collend, &newpos);
6281 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006282 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006284 if (PyBytes_Check(repunicode)) {
6285 /* Directly copy bytes result to output. */
6286 repsize = PyBytes_Size(repunicode);
6287 if (repsize > 1) {
6288 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006289 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006290 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6291 Py_DECREF(repunicode);
6292 goto onError;
6293 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006294 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006295 ressize += repsize-1;
6296 }
6297 memcpy(str, PyBytes_AsString(repunicode), repsize);
6298 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006299 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006300 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006301 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* need more space? (at least enough for what we
6304 have+the replacement+the rest of the string, so
6305 we won't have to check space for encodable characters) */
6306 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006307 repsize = PyUnicode_GET_LENGTH(repunicode);
6308 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 if (requiredsize > ressize) {
6310 if (requiredsize<2*ressize)
6311 requiredsize = 2*ressize;
6312 if (_PyBytes_Resize(&res, requiredsize)) {
6313 Py_DECREF(repunicode);
6314 goto onError;
6315 }
6316 str = PyBytes_AS_STRING(res) + respos;
6317 ressize = requiredsize;
6318 }
6319 /* check if there is anything unencodable in the replacement
6320 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006321 for (i = 0; repsize-->0; ++i, ++str) {
6322 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006323 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006324 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006325 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 Py_DECREF(repunicode);
6327 goto onError;
6328 }
6329 *str = (char)c;
6330 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006332 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006334 }
6335 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006336 /* Resize if we allocated to much */
6337 size = str - PyBytes_AS_STRING(res);
6338 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006339 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006340 if (_PyBytes_Resize(&res, size) < 0)
6341 goto onError;
6342 }
6343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006344 Py_XDECREF(errorHandler);
6345 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006346 return res;
6347
6348 onError:
6349 Py_XDECREF(res);
6350 Py_XDECREF(errorHandler);
6351 Py_XDECREF(exc);
6352 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353}
6354
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006356PyObject *
6357PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006358 Py_ssize_t size,
6359 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006360{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006361 PyObject *result;
6362 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6363 if (unicode == NULL)
6364 return NULL;
6365 result = unicode_encode_ucs1(unicode, errors, 256);
6366 Py_DECREF(unicode);
6367 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368}
6369
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006371_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
6373 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 PyErr_BadArgument();
6375 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006377 if (PyUnicode_READY(unicode) == -1)
6378 return NULL;
6379 /* Fast path: if it is a one-byte string, construct
6380 bytes object directly. */
6381 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6382 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6383 PyUnicode_GET_LENGTH(unicode));
6384 /* Non-Latin-1 characters present. Defer to above function to
6385 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006386 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006387}
6388
6389PyObject*
6390PyUnicode_AsLatin1String(PyObject *unicode)
6391{
6392 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393}
6394
6395/* --- 7-bit ASCII Codec -------------------------------------------------- */
6396
Alexander Belopolsky40018472011-02-26 01:02:56 +00006397PyObject *
6398PyUnicode_DecodeASCII(const char *s,
6399 Py_ssize_t size,
6400 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006402 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006403 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006404 int kind;
6405 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006406 Py_ssize_t startinpos;
6407 Py_ssize_t endinpos;
6408 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 const char *e;
6410 PyObject *errorHandler = NULL;
6411 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006412
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006413 if (size == 0) {
6414 Py_INCREF(unicode_empty);
6415 return unicode_empty;
6416 }
6417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006419 if (size == 1 && (unsigned char)s[0] < 128)
6420 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006421
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006422 unicode = PyUnicode_New(size, 127);
6423 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006425
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006427 data = PyUnicode_1BYTE_DATA(unicode);
6428 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6429 if (outpos == size)
6430 return unicode;
6431
6432 s += outpos;
6433 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 register unsigned char c = (unsigned char)*s;
6436 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006437 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 ++s;
6439 }
6440 else {
6441 startinpos = s-starts;
6442 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 if (unicode_decode_call_errorhandler(
6444 errors, &errorHandler,
6445 "ascii", "ordinal not in range(128)",
6446 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006447 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006449 kind = PyUnicode_KIND(unicode);
6450 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006453 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006454 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 Py_XDECREF(errorHandler);
6456 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006457 assert(_PyUnicode_CheckConsistency(unicode, 1));
6458 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006461 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462 Py_XDECREF(errorHandler);
6463 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006464 return NULL;
6465}
6466
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006467/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006468PyObject *
6469PyUnicode_EncodeASCII(const Py_UNICODE *p,
6470 Py_ssize_t size,
6471 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006472{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 PyObject *result;
6474 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6475 if (unicode == NULL)
6476 return NULL;
6477 result = unicode_encode_ucs1(unicode, errors, 128);
6478 Py_DECREF(unicode);
6479 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480}
6481
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006483_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484{
6485 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 PyErr_BadArgument();
6487 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006489 if (PyUnicode_READY(unicode) == -1)
6490 return NULL;
6491 /* Fast path: if it is an ASCII-only string, construct bytes object
6492 directly. Else defer to above function to raise the exception. */
6493 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6494 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6495 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497}
6498
6499PyObject *
6500PyUnicode_AsASCIIString(PyObject *unicode)
6501{
6502 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Victor Stinner99b95382011-07-04 14:23:54 +02006505#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006506
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006507/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006508
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006509#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006510#define NEED_RETRY
6511#endif
6512
Victor Stinner3a50e702011-10-18 21:21:00 +02006513#ifndef WC_ERR_INVALID_CHARS
6514# define WC_ERR_INVALID_CHARS 0x0080
6515#endif
6516
6517static char*
6518code_page_name(UINT code_page, PyObject **obj)
6519{
6520 *obj = NULL;
6521 if (code_page == CP_ACP)
6522 return "mbcs";
6523 if (code_page == CP_UTF7)
6524 return "CP_UTF7";
6525 if (code_page == CP_UTF8)
6526 return "CP_UTF8";
6527
6528 *obj = PyBytes_FromFormat("cp%u", code_page);
6529 if (*obj == NULL)
6530 return NULL;
6531 return PyBytes_AS_STRING(*obj);
6532}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006533
Alexander Belopolsky40018472011-02-26 01:02:56 +00006534static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006535is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006536{
6537 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006538 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006539
Victor Stinner3a50e702011-10-18 21:21:00 +02006540 if (!IsDBCSLeadByteEx(code_page, *curr))
6541 return 0;
6542
6543 prev = CharPrevExA(code_page, s, curr, 0);
6544 if (prev == curr)
6545 return 1;
6546 /* FIXME: This code is limited to "true" double-byte encodings,
6547 as it assumes an incomplete character consists of a single
6548 byte. */
6549 if (curr - prev == 2)
6550 return 1;
6551 if (!IsDBCSLeadByteEx(code_page, *prev))
6552 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006553 return 0;
6554}
6555
Victor Stinner3a50e702011-10-18 21:21:00 +02006556static DWORD
6557decode_code_page_flags(UINT code_page)
6558{
6559 if (code_page == CP_UTF7) {
6560 /* The CP_UTF7 decoder only supports flags=0 */
6561 return 0;
6562 }
6563 else
6564 return MB_ERR_INVALID_CHARS;
6565}
6566
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006567/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006568 * Decode a byte string from a Windows code page into unicode object in strict
6569 * mode.
6570 *
6571 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6572 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006573 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006574static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006575decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006576 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006577 const char *in,
6578 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006579{
Victor Stinner3a50e702011-10-18 21:21:00 +02006580 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006581 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006582 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006583
6584 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006585 assert(insize > 0);
6586 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6587 if (outsize <= 0)
6588 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006589
6590 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006591 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006592 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006593 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006594 if (*v == NULL)
6595 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006596 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006597 }
6598 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006599 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006600 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006601 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006602 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006604 }
6605
6606 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006607 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6608 if (outsize <= 0)
6609 goto error;
6610 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006611
Victor Stinner3a50e702011-10-18 21:21:00 +02006612error:
6613 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6614 return -2;
6615 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006616 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006617}
6618
Victor Stinner3a50e702011-10-18 21:21:00 +02006619/*
6620 * Decode a byte string from a code page into unicode object with an error
6621 * handler.
6622 *
6623 * Returns consumed size if succeed, or raise a WindowsError or
6624 * UnicodeDecodeError exception and returns -1 on error.
6625 */
6626static int
6627decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006628 PyObject **v,
6629 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 const char *errors)
6631{
6632 const char *startin = in;
6633 const char *endin = in + size;
6634 const DWORD flags = decode_code_page_flags(code_page);
6635 /* Ideally, we should get reason from FormatMessage. This is the Windows
6636 2000 English version of the message. */
6637 const char *reason = "No mapping for the Unicode character exists "
6638 "in the target code page.";
6639 /* each step cannot decode more than 1 character, but a character can be
6640 represented as a surrogate pair */
6641 wchar_t buffer[2], *startout, *out;
6642 int insize, outsize;
6643 PyObject *errorHandler = NULL;
6644 PyObject *exc = NULL;
6645 PyObject *encoding_obj = NULL;
6646 char *encoding;
6647 DWORD err;
6648 int ret = -1;
6649
6650 assert(size > 0);
6651
6652 encoding = code_page_name(code_page, &encoding_obj);
6653 if (encoding == NULL)
6654 return -1;
6655
6656 if (errors == NULL || strcmp(errors, "strict") == 0) {
6657 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6658 UnicodeDecodeError. */
6659 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6660 if (exc != NULL) {
6661 PyCodec_StrictErrors(exc);
6662 Py_CLEAR(exc);
6663 }
6664 goto error;
6665 }
6666
6667 if (*v == NULL) {
6668 /* Create unicode object */
6669 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6670 PyErr_NoMemory();
6671 goto error;
6672 }
Victor Stinnerab595942011-12-17 04:59:06 +01006673 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006674 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006675 if (*v == NULL)
6676 goto error;
6677 startout = PyUnicode_AS_UNICODE(*v);
6678 }
6679 else {
6680 /* Extend unicode object */
6681 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6682 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6683 PyErr_NoMemory();
6684 goto error;
6685 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006686 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006687 goto error;
6688 startout = PyUnicode_AS_UNICODE(*v) + n;
6689 }
6690
6691 /* Decode the byte string character per character */
6692 out = startout;
6693 while (in < endin)
6694 {
6695 /* Decode a character */
6696 insize = 1;
6697 do
6698 {
6699 outsize = MultiByteToWideChar(code_page, flags,
6700 in, insize,
6701 buffer, Py_ARRAY_LENGTH(buffer));
6702 if (outsize > 0)
6703 break;
6704 err = GetLastError();
6705 if (err != ERROR_NO_UNICODE_TRANSLATION
6706 && err != ERROR_INSUFFICIENT_BUFFER)
6707 {
6708 PyErr_SetFromWindowsErr(0);
6709 goto error;
6710 }
6711 insize++;
6712 }
6713 /* 4=maximum length of a UTF-8 sequence */
6714 while (insize <= 4 && (in + insize) <= endin);
6715
6716 if (outsize <= 0) {
6717 Py_ssize_t startinpos, endinpos, outpos;
6718
6719 startinpos = in - startin;
6720 endinpos = startinpos + 1;
6721 outpos = out - PyUnicode_AS_UNICODE(*v);
6722 if (unicode_decode_call_errorhandler(
6723 errors, &errorHandler,
6724 encoding, reason,
6725 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006726 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006727 {
6728 goto error;
6729 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006730 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 }
6732 else {
6733 in += insize;
6734 memcpy(out, buffer, outsize * sizeof(wchar_t));
6735 out += outsize;
6736 }
6737 }
6738
6739 /* write a NUL character at the end */
6740 *out = 0;
6741
6742 /* Extend unicode object */
6743 outsize = out - startout;
6744 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006745 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006746 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006747 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006748
6749error:
6750 Py_XDECREF(encoding_obj);
6751 Py_XDECREF(errorHandler);
6752 Py_XDECREF(exc);
6753 return ret;
6754}
6755
Victor Stinner3a50e702011-10-18 21:21:00 +02006756static PyObject *
6757decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006758 const char *s, Py_ssize_t size,
6759 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006760{
Victor Stinner76a31a62011-11-04 00:05:13 +01006761 PyObject *v = NULL;
6762 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006763
Victor Stinner3a50e702011-10-18 21:21:00 +02006764 if (code_page < 0) {
6765 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6766 return NULL;
6767 }
6768
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006769 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771
Victor Stinner76a31a62011-11-04 00:05:13 +01006772 do
6773 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006775 if (size > INT_MAX) {
6776 chunk_size = INT_MAX;
6777 final = 0;
6778 done = 0;
6779 }
6780 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006781#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006782 {
6783 chunk_size = (int)size;
6784 final = (consumed == NULL);
6785 done = 1;
6786 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006787
Victor Stinner76a31a62011-11-04 00:05:13 +01006788 /* Skip trailing lead-byte unless 'final' is set */
6789 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6790 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006791
Victor Stinner76a31a62011-11-04 00:05:13 +01006792 if (chunk_size == 0 && done) {
6793 if (v != NULL)
6794 break;
6795 Py_INCREF(unicode_empty);
6796 return unicode_empty;
6797 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798
Victor Stinner76a31a62011-11-04 00:05:13 +01006799
6800 converted = decode_code_page_strict(code_page, &v,
6801 s, chunk_size);
6802 if (converted == -2)
6803 converted = decode_code_page_errors(code_page, &v,
6804 s, chunk_size,
6805 errors);
6806 assert(converted != 0);
6807
6808 if (converted < 0) {
6809 Py_XDECREF(v);
6810 return NULL;
6811 }
6812
6813 if (consumed)
6814 *consumed += converted;
6815
6816 s += converted;
6817 size -= converted;
6818 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006819
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006820 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821}
6822
Alexander Belopolsky40018472011-02-26 01:02:56 +00006823PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006824PyUnicode_DecodeCodePageStateful(int code_page,
6825 const char *s,
6826 Py_ssize_t size,
6827 const char *errors,
6828 Py_ssize_t *consumed)
6829{
6830 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6831}
6832
6833PyObject *
6834PyUnicode_DecodeMBCSStateful(const char *s,
6835 Py_ssize_t size,
6836 const char *errors,
6837 Py_ssize_t *consumed)
6838{
6839 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6840}
6841
6842PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006843PyUnicode_DecodeMBCS(const char *s,
6844 Py_ssize_t size,
6845 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006846{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6848}
6849
Victor Stinner3a50e702011-10-18 21:21:00 +02006850static DWORD
6851encode_code_page_flags(UINT code_page, const char *errors)
6852{
6853 if (code_page == CP_UTF8) {
6854 if (winver.dwMajorVersion >= 6)
6855 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6856 and later */
6857 return WC_ERR_INVALID_CHARS;
6858 else
6859 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6860 return 0;
6861 }
6862 else if (code_page == CP_UTF7) {
6863 /* CP_UTF7 only supports flags=0 */
6864 return 0;
6865 }
6866 else {
6867 if (errors != NULL && strcmp(errors, "replace") == 0)
6868 return 0;
6869 else
6870 return WC_NO_BEST_FIT_CHARS;
6871 }
6872}
6873
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006874/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006875 * Encode a Unicode string to a Windows code page into a byte string in strict
6876 * mode.
6877 *
6878 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6879 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006880 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006882encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006883 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006884 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006885{
Victor Stinner554f3f02010-06-16 23:33:54 +00006886 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 BOOL *pusedDefaultChar = &usedDefaultChar;
6888 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006889 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006890 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006891 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 const DWORD flags = encode_code_page_flags(code_page, NULL);
6893 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006894 /* Create a substring so that we can get the UTF-16 representation
6895 of just the slice under consideration. */
6896 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897
Martin v. Löwis3d325192011-11-04 18:23:06 +01006898 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006899
Victor Stinner3a50e702011-10-18 21:21:00 +02006900 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006901 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006902 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006903 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006904
Victor Stinner2fc507f2011-11-04 20:06:39 +01006905 substring = PyUnicode_Substring(unicode, offset, offset+len);
6906 if (substring == NULL)
6907 return -1;
6908 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6909 if (p == NULL) {
6910 Py_DECREF(substring);
6911 return -1;
6912 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006913
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006914 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 outsize = WideCharToMultiByte(code_page, flags,
6916 p, size,
6917 NULL, 0,
6918 NULL, pusedDefaultChar);
6919 if (outsize <= 0)
6920 goto error;
6921 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006922 if (pusedDefaultChar && *pusedDefaultChar) {
6923 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006924 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006925 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006926
Victor Stinner3a50e702011-10-18 21:21:00 +02006927 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006929 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006930 if (*outbytes == NULL) {
6931 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006933 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006934 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006935 }
6936 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006938 const Py_ssize_t n = PyBytes_Size(*outbytes);
6939 if (outsize > PY_SSIZE_T_MAX - n) {
6940 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006941 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006942 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006943 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006944 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6945 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006946 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006947 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006948 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949 }
6950
6951 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 outsize = WideCharToMultiByte(code_page, flags,
6953 p, size,
6954 out, outsize,
6955 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006956 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 if (outsize <= 0)
6958 goto error;
6959 if (pusedDefaultChar && *pusedDefaultChar)
6960 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006961 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006962
Victor Stinner3a50e702011-10-18 21:21:00 +02006963error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006964 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6966 return -2;
6967 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006968 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006969}
6970
Victor Stinner3a50e702011-10-18 21:21:00 +02006971/*
6972 * Encode a Unicode string to a Windows code page into a byte string using a
6973 * error handler.
6974 *
6975 * Returns consumed characters if succeed, or raise a WindowsError and returns
6976 * -1 on other error.
6977 */
6978static int
6979encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01006980 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006981 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006982{
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006984 Py_ssize_t pos = unicode_offset;
6985 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 /* Ideally, we should get reason from FormatMessage. This is the Windows
6987 2000 English version of the message. */
6988 const char *reason = "invalid character";
6989 /* 4=maximum length of a UTF-8 sequence */
6990 char buffer[4];
6991 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
6992 Py_ssize_t outsize;
6993 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 PyObject *errorHandler = NULL;
6995 PyObject *exc = NULL;
6996 PyObject *encoding_obj = NULL;
6997 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01006998 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006999 PyObject *rep;
7000 int ret = -1;
7001
7002 assert(insize > 0);
7003
7004 encoding = code_page_name(code_page, &encoding_obj);
7005 if (encoding == NULL)
7006 return -1;
7007
7008 if (errors == NULL || strcmp(errors, "strict") == 0) {
7009 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7010 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007011 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007012 if (exc != NULL) {
7013 PyCodec_StrictErrors(exc);
7014 Py_DECREF(exc);
7015 }
7016 Py_XDECREF(encoding_obj);
7017 return -1;
7018 }
7019
7020 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7021 pusedDefaultChar = &usedDefaultChar;
7022 else
7023 pusedDefaultChar = NULL;
7024
7025 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7026 PyErr_NoMemory();
7027 goto error;
7028 }
7029 outsize = insize * Py_ARRAY_LENGTH(buffer);
7030
7031 if (*outbytes == NULL) {
7032 /* Create string object */
7033 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7034 if (*outbytes == NULL)
7035 goto error;
7036 out = PyBytes_AS_STRING(*outbytes);
7037 }
7038 else {
7039 /* Extend string object */
7040 Py_ssize_t n = PyBytes_Size(*outbytes);
7041 if (n > PY_SSIZE_T_MAX - outsize) {
7042 PyErr_NoMemory();
7043 goto error;
7044 }
7045 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7046 goto error;
7047 out = PyBytes_AS_STRING(*outbytes) + n;
7048 }
7049
7050 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007051 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007052 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007053 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7054 wchar_t chars[2];
7055 int charsize;
7056 if (ch < 0x10000) {
7057 chars[0] = (wchar_t)ch;
7058 charsize = 1;
7059 }
7060 else {
7061 ch -= 0x10000;
7062 chars[0] = 0xd800 + (ch >> 10);
7063 chars[1] = 0xdc00 + (ch & 0x3ff);
7064 charsize = 2;
7065 }
7066
Victor Stinner3a50e702011-10-18 21:21:00 +02007067 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007068 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 buffer, Py_ARRAY_LENGTH(buffer),
7070 NULL, pusedDefaultChar);
7071 if (outsize > 0) {
7072 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7073 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007074 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 memcpy(out, buffer, outsize);
7076 out += outsize;
7077 continue;
7078 }
7079 }
7080 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7081 PyErr_SetFromWindowsErr(0);
7082 goto error;
7083 }
7084
Victor Stinner3a50e702011-10-18 21:21:00 +02007085 rep = unicode_encode_call_errorhandler(
7086 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007087 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007088 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 if (rep == NULL)
7090 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007091 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007092
7093 if (PyBytes_Check(rep)) {
7094 outsize = PyBytes_GET_SIZE(rep);
7095 if (outsize != 1) {
7096 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7097 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7098 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7099 Py_DECREF(rep);
7100 goto error;
7101 }
7102 out = PyBytes_AS_STRING(*outbytes) + offset;
7103 }
7104 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7105 out += outsize;
7106 }
7107 else {
7108 Py_ssize_t i;
7109 enum PyUnicode_Kind kind;
7110 void *data;
7111
Benjamin Petersonbac79492012-01-14 13:34:47 -05007112 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 Py_DECREF(rep);
7114 goto error;
7115 }
7116
7117 outsize = PyUnicode_GET_LENGTH(rep);
7118 if (outsize != 1) {
7119 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7120 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7121 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7122 Py_DECREF(rep);
7123 goto error;
7124 }
7125 out = PyBytes_AS_STRING(*outbytes) + offset;
7126 }
7127 kind = PyUnicode_KIND(rep);
7128 data = PyUnicode_DATA(rep);
7129 for (i=0; i < outsize; i++) {
7130 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7131 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007132 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007133 encoding, unicode,
7134 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 "unable to encode error handler result to ASCII");
7136 Py_DECREF(rep);
7137 goto error;
7138 }
7139 *out = (unsigned char)ch;
7140 out++;
7141 }
7142 }
7143 Py_DECREF(rep);
7144 }
7145 /* write a NUL byte */
7146 *out = 0;
7147 outsize = out - PyBytes_AS_STRING(*outbytes);
7148 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7149 if (_PyBytes_Resize(outbytes, outsize) < 0)
7150 goto error;
7151 ret = 0;
7152
7153error:
7154 Py_XDECREF(encoding_obj);
7155 Py_XDECREF(errorHandler);
7156 Py_XDECREF(exc);
7157 return ret;
7158}
7159
Victor Stinner3a50e702011-10-18 21:21:00 +02007160static PyObject *
7161encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007162 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007163 const char *errors)
7164{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007165 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007167 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007168 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007169
Benjamin Petersonbac79492012-01-14 13:34:47 -05007170 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007171 return NULL;
7172 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007173
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 if (code_page < 0) {
7175 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7176 return NULL;
7177 }
7178
Martin v. Löwis3d325192011-11-04 18:23:06 +01007179 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007180 return PyBytes_FromStringAndSize(NULL, 0);
7181
Victor Stinner7581cef2011-11-03 22:32:33 +01007182 offset = 0;
7183 do
7184 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007187 chunks. */
7188 if (len > INT_MAX/2) {
7189 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007190 done = 0;
7191 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007192 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007194 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007196 done = 1;
7197 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007198
Victor Stinner76a31a62011-11-04 00:05:13 +01007199 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007200 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007201 errors);
7202 if (ret == -2)
7203 ret = encode_code_page_errors(code_page, &outbytes,
7204 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007205 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007206 if (ret < 0) {
7207 Py_XDECREF(outbytes);
7208 return NULL;
7209 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007210
Victor Stinner7581cef2011-11-03 22:32:33 +01007211 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007212 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007213 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 return outbytes;
7216}
7217
7218PyObject *
7219PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7220 Py_ssize_t size,
7221 const char *errors)
7222{
Victor Stinner7581cef2011-11-03 22:32:33 +01007223 PyObject *unicode, *res;
7224 unicode = PyUnicode_FromUnicode(p, size);
7225 if (unicode == NULL)
7226 return NULL;
7227 res = encode_code_page(CP_ACP, unicode, errors);
7228 Py_DECREF(unicode);
7229 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007230}
7231
7232PyObject *
7233PyUnicode_EncodeCodePage(int code_page,
7234 PyObject *unicode,
7235 const char *errors)
7236{
Victor Stinner7581cef2011-11-03 22:32:33 +01007237 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007238}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007239
Alexander Belopolsky40018472011-02-26 01:02:56 +00007240PyObject *
7241PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007242{
7243 if (!PyUnicode_Check(unicode)) {
7244 PyErr_BadArgument();
7245 return NULL;
7246 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007247 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007248}
7249
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007250#undef NEED_RETRY
7251
Victor Stinner99b95382011-07-04 14:23:54 +02007252#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007253
Guido van Rossumd57fd912000-03-10 22:53:23 +00007254/* --- Character Mapping Codec -------------------------------------------- */
7255
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256PyObject *
7257PyUnicode_DecodeCharmap(const char *s,
7258 Py_ssize_t size,
7259 PyObject *mapping,
7260 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007262 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007263 Py_ssize_t startinpos;
7264 Py_ssize_t endinpos;
7265 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007266 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007267 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007268 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007269 PyObject *errorHandler = NULL;
7270 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007271
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 /* Default to Latin-1 */
7273 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007274 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007276 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007280 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007281 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007282 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007283 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007284 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007285 enum PyUnicode_Kind mapkind;
7286 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007287 Py_UCS4 x;
7288
Benjamin Petersonbac79492012-01-14 13:34:47 -05007289 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007290 return NULL;
7291
7292 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007293 mapdata = PyUnicode_DATA(mapping);
7294 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007295 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007296 unsigned char ch;
7297 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7298 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7299 if (outkind == PyUnicode_1BYTE_KIND) {
7300 void *outdata = PyUnicode_DATA(v);
7301 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7302 while (s < e) {
7303 unsigned char ch = *s;
7304 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7305 if (x > maxchar)
7306 goto Error;
7307 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7308 ++s;
7309 }
7310 break;
7311 }
7312 else if (outkind == PyUnicode_2BYTE_KIND) {
7313 void *outdata = PyUnicode_DATA(v);
7314 while (s < e) {
7315 unsigned char ch = *s;
7316 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7317 if (x == 0xFFFE)
7318 goto Error;
7319 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7320 ++s;
7321 }
7322 break;
7323 }
7324 }
7325 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007326
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007328 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007329 else
7330 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007331Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007332 if (x == 0xfffe)
7333 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007334 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 startinpos = s-starts;
7336 endinpos = startinpos+1;
7337 if (unicode_decode_call_errorhandler(
7338 errors, &errorHandler,
7339 "charmap", "character maps to <undefined>",
7340 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007341 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 goto onError;
7343 }
7344 continue;
7345 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007346
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007347 if (unicode_putchar(&v, &outpos, x) < 0)
7348 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007350 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007351 }
7352 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007353 while (s < e) {
7354 unsigned char ch = *s;
7355 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007356
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7358 w = PyLong_FromLong((long)ch);
7359 if (w == NULL)
7360 goto onError;
7361 x = PyObject_GetItem(mapping, w);
7362 Py_DECREF(w);
7363 if (x == NULL) {
7364 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7365 /* No mapping found means: mapping is undefined. */
7366 PyErr_Clear();
7367 x = Py_None;
7368 Py_INCREF(x);
7369 } else
7370 goto onError;
7371 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007372
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 /* Apply mapping */
7374 if (PyLong_Check(x)) {
7375 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007376 if (value < 0 || value > MAX_UNICODE) {
7377 PyErr_Format(PyExc_TypeError,
7378 "character mapping must be in range(0x%lx)",
7379 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 Py_DECREF(x);
7381 goto onError;
7382 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007383 if (unicode_putchar(&v, &outpos, value) < 0)
7384 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 }
7386 else if (x == Py_None) {
7387 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 startinpos = s-starts;
7389 endinpos = startinpos+1;
7390 if (unicode_decode_call_errorhandler(
7391 errors, &errorHandler,
7392 "charmap", "character maps to <undefined>",
7393 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007394 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007395 Py_DECREF(x);
7396 goto onError;
7397 }
7398 Py_DECREF(x);
7399 continue;
7400 }
7401 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007402 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007403
Benjamin Petersonbac79492012-01-14 13:34:47 -05007404 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007405 goto onError;
7406 targetsize = PyUnicode_GET_LENGTH(x);
7407
7408 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007409 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007410 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007411 PyUnicode_READ_CHAR(x, 0)) < 0)
7412 goto onError;
7413 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007414 else if (targetsize > 1) {
7415 /* 1-n mapping */
7416 if (targetsize > extrachars) {
7417 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 Py_ssize_t needed = (targetsize - extrachars) + \
7419 (targetsize << 2);
7420 extrachars += needed;
7421 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007422 if (unicode_resize(&v,
7423 PyUnicode_GET_LENGTH(v) + needed) < 0)
7424 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007425 Py_DECREF(x);
7426 goto onError;
7427 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007428 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007429 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007430 goto onError;
7431 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7432 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007433 extrachars -= targetsize;
7434 }
7435 /* 1-0 mapping: skip the character */
7436 }
7437 else {
7438 /* wrong return value */
7439 PyErr_SetString(PyExc_TypeError,
7440 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007441 Py_DECREF(x);
7442 goto onError;
7443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007444 Py_DECREF(x);
7445 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007446 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007447 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007448 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007449 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007450 Py_XDECREF(errorHandler);
7451 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007452 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007453
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007455 Py_XDECREF(errorHandler);
7456 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457 Py_XDECREF(v);
7458 return NULL;
7459}
7460
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007461/* Charmap encoding: the lookup table */
7462
Alexander Belopolsky40018472011-02-26 01:02:56 +00007463struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007464 PyObject_HEAD
7465 unsigned char level1[32];
7466 int count2, count3;
7467 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007468};
7469
7470static PyObject*
7471encoding_map_size(PyObject *obj, PyObject* args)
7472{
7473 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007474 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007476}
7477
7478static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 PyDoc_STR("Return the size (in bytes) of this object") },
7481 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007482};
7483
7484static void
7485encoding_map_dealloc(PyObject* o)
7486{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007487 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007488}
7489
7490static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007491 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007492 "EncodingMap", /*tp_name*/
7493 sizeof(struct encoding_map), /*tp_basicsize*/
7494 0, /*tp_itemsize*/
7495 /* methods */
7496 encoding_map_dealloc, /*tp_dealloc*/
7497 0, /*tp_print*/
7498 0, /*tp_getattr*/
7499 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007500 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007501 0, /*tp_repr*/
7502 0, /*tp_as_number*/
7503 0, /*tp_as_sequence*/
7504 0, /*tp_as_mapping*/
7505 0, /*tp_hash*/
7506 0, /*tp_call*/
7507 0, /*tp_str*/
7508 0, /*tp_getattro*/
7509 0, /*tp_setattro*/
7510 0, /*tp_as_buffer*/
7511 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7512 0, /*tp_doc*/
7513 0, /*tp_traverse*/
7514 0, /*tp_clear*/
7515 0, /*tp_richcompare*/
7516 0, /*tp_weaklistoffset*/
7517 0, /*tp_iter*/
7518 0, /*tp_iternext*/
7519 encoding_map_methods, /*tp_methods*/
7520 0, /*tp_members*/
7521 0, /*tp_getset*/
7522 0, /*tp_base*/
7523 0, /*tp_dict*/
7524 0, /*tp_descr_get*/
7525 0, /*tp_descr_set*/
7526 0, /*tp_dictoffset*/
7527 0, /*tp_init*/
7528 0, /*tp_alloc*/
7529 0, /*tp_new*/
7530 0, /*tp_free*/
7531 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007532};
7533
7534PyObject*
7535PyUnicode_BuildEncodingMap(PyObject* string)
7536{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007537 PyObject *result;
7538 struct encoding_map *mresult;
7539 int i;
7540 int need_dict = 0;
7541 unsigned char level1[32];
7542 unsigned char level2[512];
7543 unsigned char *mlevel1, *mlevel2, *mlevel3;
7544 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007545 int kind;
7546 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007547 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007548 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007549
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007550 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007551 PyErr_BadArgument();
7552 return NULL;
7553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007554 kind = PyUnicode_KIND(string);
7555 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007556 length = PyUnicode_GET_LENGTH(string);
7557 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007558 memset(level1, 0xFF, sizeof level1);
7559 memset(level2, 0xFF, sizeof level2);
7560
7561 /* If there isn't a one-to-one mapping of NULL to \0,
7562 or if there are non-BMP characters, we need to use
7563 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007564 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007565 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007566 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007567 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007568 ch = PyUnicode_READ(kind, data, i);
7569 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007570 need_dict = 1;
7571 break;
7572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007573 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007574 /* unmapped character */
7575 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007576 l1 = ch >> 11;
7577 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007578 if (level1[l1] == 0xFF)
7579 level1[l1] = count2++;
7580 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007581 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007582 }
7583
7584 if (count2 >= 0xFF || count3 >= 0xFF)
7585 need_dict = 1;
7586
7587 if (need_dict) {
7588 PyObject *result = PyDict_New();
7589 PyObject *key, *value;
7590 if (!result)
7591 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007592 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007593 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007594 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007595 if (!key || !value)
7596 goto failed1;
7597 if (PyDict_SetItem(result, key, value) == -1)
7598 goto failed1;
7599 Py_DECREF(key);
7600 Py_DECREF(value);
7601 }
7602 return result;
7603 failed1:
7604 Py_XDECREF(key);
7605 Py_XDECREF(value);
7606 Py_DECREF(result);
7607 return NULL;
7608 }
7609
7610 /* Create a three-level trie */
7611 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7612 16*count2 + 128*count3 - 1);
7613 if (!result)
7614 return PyErr_NoMemory();
7615 PyObject_Init(result, &EncodingMapType);
7616 mresult = (struct encoding_map*)result;
7617 mresult->count2 = count2;
7618 mresult->count3 = count3;
7619 mlevel1 = mresult->level1;
7620 mlevel2 = mresult->level23;
7621 mlevel3 = mresult->level23 + 16*count2;
7622 memcpy(mlevel1, level1, 32);
7623 memset(mlevel2, 0xFF, 16*count2);
7624 memset(mlevel3, 0, 128*count3);
7625 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007626 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007628 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7629 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007630 /* unmapped character */
7631 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007632 o1 = ch>>11;
7633 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007634 i2 = 16*mlevel1[o1] + o2;
7635 if (mlevel2[i2] == 0xFF)
7636 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007637 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007638 i3 = 128*mlevel2[i2] + o3;
7639 mlevel3[i3] = i;
7640 }
7641 return result;
7642}
7643
7644static int
Victor Stinner22168992011-11-20 17:09:18 +01007645encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007646{
7647 struct encoding_map *map = (struct encoding_map*)mapping;
7648 int l1 = c>>11;
7649 int l2 = (c>>7) & 0xF;
7650 int l3 = c & 0x7F;
7651 int i;
7652
Victor Stinner22168992011-11-20 17:09:18 +01007653 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007655 if (c == 0)
7656 return 0;
7657 /* level 1*/
7658 i = map->level1[l1];
7659 if (i == 0xFF) {
7660 return -1;
7661 }
7662 /* level 2*/
7663 i = map->level23[16*i+l2];
7664 if (i == 0xFF) {
7665 return -1;
7666 }
7667 /* level 3 */
7668 i = map->level23[16*map->count2 + 128*i + l3];
7669 if (i == 0) {
7670 return -1;
7671 }
7672 return i;
7673}
7674
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007675/* Lookup the character ch in the mapping. If the character
7676 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007677 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007678static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007679charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007680{
Christian Heimes217cfd12007-12-02 14:31:20 +00007681 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007682 PyObject *x;
7683
7684 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007686 x = PyObject_GetItem(mapping, w);
7687 Py_DECREF(w);
7688 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007689 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7690 /* No mapping found means: mapping is undefined. */
7691 PyErr_Clear();
7692 x = Py_None;
7693 Py_INCREF(x);
7694 return x;
7695 } else
7696 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007698 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007699 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007700 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007701 long value = PyLong_AS_LONG(x);
7702 if (value < 0 || value > 255) {
7703 PyErr_SetString(PyExc_TypeError,
7704 "character mapping must be in range(256)");
7705 Py_DECREF(x);
7706 return NULL;
7707 }
7708 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007709 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007710 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 /* wrong return value */
7714 PyErr_Format(PyExc_TypeError,
7715 "character mapping must return integer, bytes or None, not %.400s",
7716 x->ob_type->tp_name);
7717 Py_DECREF(x);
7718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007719 }
7720}
7721
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007722static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007723charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007724{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7726 /* exponentially overallocate to minimize reallocations */
7727 if (requiredsize < 2*outsize)
7728 requiredsize = 2*outsize;
7729 if (_PyBytes_Resize(outobj, requiredsize))
7730 return -1;
7731 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007732}
7733
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* the lookup or a resize failed */
} charmapencode_result;
/* Look up the character, put the result in the output bytes object and
   advance the output position. Resize the output bytes object if not
   enough space is available. Return enc_SUCCESS if the replacement was
   written, enc_FAILED if the mapping was undefined (in which case nothing
   was written), or enc_EXCEPTION if the lookup or a reallocation
   failed. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007744charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007745 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007746{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747 PyObject *rep;
7748 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007749 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750
Christian Heimes90aa7642007-12-19 02:45:37 +00007751 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007753 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754 if (res == -1)
7755 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 if (outsize<requiredsize)
7757 if (charmapencode_resize(outobj, outpos, requiredsize))
7758 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007759 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 outstart[(*outpos)++] = (char)res;
7761 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762 }
7763
7764 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007766 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007767 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 Py_DECREF(rep);
7769 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007770 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 if (PyLong_Check(rep)) {
7772 Py_ssize_t requiredsize = *outpos+1;
7773 if (outsize<requiredsize)
7774 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7775 Py_DECREF(rep);
7776 return enc_EXCEPTION;
7777 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007778 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007780 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007781 else {
7782 const char *repchars = PyBytes_AS_STRING(rep);
7783 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7784 Py_ssize_t requiredsize = *outpos+repsize;
7785 if (outsize<requiredsize)
7786 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7787 Py_DECREF(rep);
7788 return enc_EXCEPTION;
7789 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007790 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 memcpy(outstart + *outpos, repchars, repsize);
7792 *outpos += repsize;
7793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007794 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007795 Py_DECREF(rep);
7796 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007797}
7798
7799/* handle an error in PyUnicode_EncodeCharmap
7800 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007801static int
7802charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007803 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007804 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007805 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007806 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007807{
7808 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007809 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007810 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007811 enum PyUnicode_Kind kind;
7812 void *data;
7813 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007815 Py_ssize_t collstartpos = *inpos;
7816 Py_ssize_t collendpos = *inpos+1;
7817 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007818 char *encoding = "charmap";
7819 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007821 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007822 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823
Benjamin Petersonbac79492012-01-14 13:34:47 -05007824 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007825 return -1;
7826 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 /* find all unencodable characters */
7828 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007829 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007830 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007831 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007832 val = encoding_map_lookup(ch, mapping);
7833 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 break;
7835 ++collendpos;
7836 continue;
7837 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007839 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7840 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 if (rep==NULL)
7842 return -1;
7843 else if (rep!=Py_None) {
7844 Py_DECREF(rep);
7845 break;
7846 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849 }
7850 /* cache callback name lookup
7851 * (if not done yet, i.e. it's the first error) */
7852 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007853 if ((errors==NULL) || (!strcmp(errors, "strict")))
7854 *known_errorHandler = 1;
7855 else if (!strcmp(errors, "replace"))
7856 *known_errorHandler = 2;
7857 else if (!strcmp(errors, "ignore"))
7858 *known_errorHandler = 3;
7859 else if (!strcmp(errors, "xmlcharrefreplace"))
7860 *known_errorHandler = 4;
7861 else
7862 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007863 }
7864 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007865 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007866 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007867 return -1;
7868 case 2: /* replace */
7869 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007870 x = charmapencode_output('?', mapping, res, respos);
7871 if (x==enc_EXCEPTION) {
7872 return -1;
7873 }
7874 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007875 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 return -1;
7877 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007878 }
7879 /* fall through */
7880 case 3: /* ignore */
7881 *inpos = collendpos;
7882 break;
7883 case 4: /* xmlcharrefreplace */
7884 /* generate replacement (temporarily (mis)uses p) */
7885 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007886 char buffer[2+29+1+1];
7887 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007888 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 for (cp = buffer; *cp; ++cp) {
7890 x = charmapencode_output(*cp, mapping, res, respos);
7891 if (x==enc_EXCEPTION)
7892 return -1;
7893 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007894 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 return -1;
7896 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007897 }
7898 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007899 *inpos = collendpos;
7900 break;
7901 default:
7902 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007903 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007905 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007907 if (PyBytes_Check(repunicode)) {
7908 /* Directly copy bytes result to output. */
7909 Py_ssize_t outsize = PyBytes_Size(*res);
7910 Py_ssize_t requiredsize;
7911 repsize = PyBytes_Size(repunicode);
7912 requiredsize = *respos + repsize;
7913 if (requiredsize > outsize)
7914 /* Make room for all additional bytes. */
7915 if (charmapencode_resize(res, respos, requiredsize)) {
7916 Py_DECREF(repunicode);
7917 return -1;
7918 }
7919 memcpy(PyBytes_AsString(*res) + *respos,
7920 PyBytes_AsString(repunicode), repsize);
7921 *respos += repsize;
7922 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007923 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007924 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007925 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007926 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007927 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007928 Py_DECREF(repunicode);
7929 return -1;
7930 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007931 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007932 data = PyUnicode_DATA(repunicode);
7933 kind = PyUnicode_KIND(repunicode);
7934 for (index = 0; index < repsize; index++) {
7935 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7936 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007937 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007938 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007939 return -1;
7940 }
7941 else if (x==enc_FAILED) {
7942 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007943 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 return -1;
7945 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007946 }
7947 *inpos = newpos;
7948 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 }
7950 return 0;
7951}
7952
Alexander Belopolsky40018472011-02-26 01:02:56 +00007953PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007954_PyUnicode_EncodeCharmap(PyObject *unicode,
7955 PyObject *mapping,
7956 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007957{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958 /* output object */
7959 PyObject *res = NULL;
7960 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007961 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007962 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007964 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007965 PyObject *errorHandler = NULL;
7966 PyObject *exc = NULL;
7967 /* the following variable is used for caching string comparisons
7968 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7969 * 3=ignore, 4=xmlcharrefreplace */
7970 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007971
Benjamin Petersonbac79492012-01-14 13:34:47 -05007972 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007973 return NULL;
7974 size = PyUnicode_GET_LENGTH(unicode);
7975
Guido van Rossumd57fd912000-03-10 22:53:23 +00007976 /* Default to Latin-1 */
7977 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 /* allocate enough for a simple encoding without
7981 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007982 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007983 if (res == NULL)
7984 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007985 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007989 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007991 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 if (x==enc_EXCEPTION) /* error */
7993 goto onError;
7994 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007995 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 &exc,
7997 &known_errorHandler, &errorHandler, errors,
7998 &res, &respos)) {
7999 goto onError;
8000 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008001 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008002 else
8003 /* done with this character => adjust input position */
8004 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008005 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008007 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008008 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008009 if (_PyBytes_Resize(&res, respos) < 0)
8010 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008011
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008012 Py_XDECREF(exc);
8013 Py_XDECREF(errorHandler);
8014 return res;
8015
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008017 Py_XDECREF(res);
8018 Py_XDECREF(exc);
8019 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008020 return NULL;
8021}
8022
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008023/* Deprecated */
8024PyObject *
8025PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8026 Py_ssize_t size,
8027 PyObject *mapping,
8028 const char *errors)
8029{
8030 PyObject *result;
8031 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8032 if (unicode == NULL)
8033 return NULL;
8034 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8035 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008036 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008037}
8038
Alexander Belopolsky40018472011-02-26 01:02:56 +00008039PyObject *
8040PyUnicode_AsCharmapString(PyObject *unicode,
8041 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008042{
8043 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 PyErr_BadArgument();
8045 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008046 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008047 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008048}
8049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008050/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008051static void
8052make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008053 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008054 Py_ssize_t startpos, Py_ssize_t endpos,
8055 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008057 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008058 *exceptionObject = _PyUnicodeTranslateError_Create(
8059 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060 }
8061 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008062 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8063 goto onError;
8064 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8065 goto onError;
8066 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8067 goto onError;
8068 return;
8069 onError:
8070 Py_DECREF(*exceptionObject);
8071 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008072 }
8073}
8074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008075/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008076static void
8077raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079 Py_ssize_t startpos, Py_ssize_t endpos,
8080 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008081{
8082 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008083 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086}
8087
8088/* error handling callback helper:
8089 build arguments, call the callback and check the arguments,
8090 put the result into newpos and return the replacement string, which
8091 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092static PyObject *
8093unicode_translate_call_errorhandler(const char *errors,
8094 PyObject **errorHandler,
8095 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008097 Py_ssize_t startpos, Py_ssize_t endpos,
8098 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008100 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008102 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 PyObject *restuple;
8104 PyObject *resunicode;
8105
8106 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008110 }
8111
8112 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008114 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008116
8117 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008122 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 Py_DECREF(restuple);
8124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125 }
8126 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 &resunicode, &i_newpos)) {
8128 Py_DECREF(restuple);
8129 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008131 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008132 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008133 else
8134 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008135 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8137 Py_DECREF(restuple);
8138 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008139 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 Py_INCREF(resunicode);
8141 Py_DECREF(restuple);
8142 return resunicode;
8143}
8144
8145/* Lookup the character ch in the mapping and put the result in result,
8146 which must be decrefed by the caller.
8147 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008148static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008149charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008150{
Christian Heimes217cfd12007-12-02 14:31:20 +00008151 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 PyObject *x;
8153
8154 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 x = PyObject_GetItem(mapping, w);
8157 Py_DECREF(w);
8158 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8160 /* No mapping found means: use 1:1 mapping. */
8161 PyErr_Clear();
8162 *result = NULL;
8163 return 0;
8164 } else
8165 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008166 }
8167 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 *result = x;
8169 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008171 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 long value = PyLong_AS_LONG(x);
8173 long max = PyUnicode_GetMax();
8174 if (value < 0 || value > max) {
8175 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008176 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008177 Py_DECREF(x);
8178 return -1;
8179 }
8180 *result = x;
8181 return 0;
8182 }
8183 else if (PyUnicode_Check(x)) {
8184 *result = x;
8185 return 0;
8186 }
8187 else {
8188 /* wrong return value */
8189 PyErr_SetString(PyExc_TypeError,
8190 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 Py_DECREF(x);
8192 return -1;
8193 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194}
8195/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 if not reallocate and adjust various state variables.
8197 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008198static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008201{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008203 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008204 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008205 /* exponentially overallocate to minimize reallocations */
8206 if (requiredsize < 2 * oldsize)
8207 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008208 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8209 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008211 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008212 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 }
8214 return 0;
8215}
8216/* lookup the character, put the result in the output string and adjust
8217 various state variables. Return a new reference to the object that
8218 was put in the output buffer in *result, or Py_None, if the mapping was
8219 undefined (in which case no character was written).
8220 The called must decref result.
8221 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008222static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008223charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8224 PyObject *mapping, Py_UCS4 **output,
8225 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008226 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008228 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8229 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008231 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 }
8235 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008237 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008239 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240 }
8241 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 Py_ssize_t repsize;
8243 if (PyUnicode_READY(*res) == -1)
8244 return -1;
8245 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 if (repsize==1) {
8247 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 }
8250 else if (repsize!=0) {
8251 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 Py_ssize_t requiredsize = *opos +
8253 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008255 Py_ssize_t i;
8256 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 for(i = 0; i < repsize; i++)
8259 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008260 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008261 }
8262 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008263 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 return 0;
8265}
8266
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008268_PyUnicode_TranslateCharmap(PyObject *input,
8269 PyObject *mapping,
8270 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 /* input object */
8273 char *idata;
8274 Py_ssize_t size, i;
8275 int kind;
8276 /* output buffer */
8277 Py_UCS4 *output = NULL;
8278 Py_ssize_t osize;
8279 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008282 char *reason = "character maps to <undefined>";
8283 PyObject *errorHandler = NULL;
8284 PyObject *exc = NULL;
8285 /* the following variable is used for caching string comparisons
8286 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8287 * 3=ignore, 4=xmlcharrefreplace */
8288 int known_errorHandler = -1;
8289
Guido van Rossumd57fd912000-03-10 22:53:23 +00008290 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 PyErr_BadArgument();
8292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008293 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 if (PyUnicode_READY(input) == -1)
8296 return NULL;
8297 idata = (char*)PyUnicode_DATA(input);
8298 kind = PyUnicode_KIND(input);
8299 size = PyUnicode_GET_LENGTH(input);
8300 i = 0;
8301
8302 if (size == 0) {
8303 Py_INCREF(input);
8304 return input;
8305 }
8306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 /* allocate enough for a simple 1:1 translation without
8308 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 osize = size;
8310 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8311 opos = 0;
8312 if (output == NULL) {
8313 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 /* try to encode it */
8319 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008320 if (charmaptranslate_output(input, i, mapping,
8321 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 Py_XDECREF(x);
8323 goto onError;
8324 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008325 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008327 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008328 else { /* untranslatable character */
8329 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8330 Py_ssize_t repsize;
8331 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008333 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 Py_ssize_t collstart = i;
8335 Py_ssize_t collend = i+1;
8336 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008339 while (collend < size) {
8340 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 goto onError;
8342 Py_XDECREF(x);
8343 if (x!=Py_None)
8344 break;
8345 ++collend;
8346 }
8347 /* cache callback name lookup
8348 * (if not done yet, i.e. it's the first error) */
8349 if (known_errorHandler==-1) {
8350 if ((errors==NULL) || (!strcmp(errors, "strict")))
8351 known_errorHandler = 1;
8352 else if (!strcmp(errors, "replace"))
8353 known_errorHandler = 2;
8354 else if (!strcmp(errors, "ignore"))
8355 known_errorHandler = 3;
8356 else if (!strcmp(errors, "xmlcharrefreplace"))
8357 known_errorHandler = 4;
8358 else
8359 known_errorHandler = 0;
8360 }
8361 switch (known_errorHandler) {
8362 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 raise_translate_exception(&exc, input, collstart,
8364 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008365 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 case 2: /* replace */
8367 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 for (coll = collstart; coll<collend; coll++)
8369 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 /* fall through */
8371 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 break;
8374 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 /* generate replacement (temporarily (mis)uses i) */
8376 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 char buffer[2+29+1+1];
8378 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8380 if (charmaptranslate_makespace(&output, &osize,
8381 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 goto onError;
8383 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 break;
8388 default:
8389 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 reason, input, &exc,
8391 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008392 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008394 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008395 Py_DECREF(repunicode);
8396 goto onError;
8397 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 repsize = PyUnicode_GET_LENGTH(repunicode);
8400 if (charmaptranslate_makespace(&output, &osize,
8401 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 Py_DECREF(repunicode);
8403 goto onError;
8404 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405 for (uni2 = 0; repsize-->0; ++uni2)
8406 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8407 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008409 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008410 }
8411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008412 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8413 if (!res)
8414 goto onError;
8415 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 Py_XDECREF(exc);
8417 Py_XDECREF(errorHandler);
8418 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008419
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 Py_XDECREF(exc);
8423 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008424 return NULL;
8425}
8426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008427/* Deprecated. Use PyUnicode_Translate instead. */
8428PyObject *
8429PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8430 Py_ssize_t size,
8431 PyObject *mapping,
8432 const char *errors)
8433{
Christian Heimes5f520f42012-09-11 14:03:25 +02008434 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8436 if (!unicode)
8437 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008438 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8439 Py_DECREF(unicode);
8440 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008441}
8442
Alexander Belopolsky40018472011-02-26 01:02:56 +00008443PyObject *
8444PyUnicode_Translate(PyObject *str,
8445 PyObject *mapping,
8446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447{
8448 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008449
Guido van Rossumd57fd912000-03-10 22:53:23 +00008450 str = PyUnicode_FromObject(str);
8451 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008452 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008454 Py_DECREF(str);
8455 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008456}
Tim Petersced69f82003-09-16 20:30:58 +00008457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008459fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460{
8461 /* No need to call PyUnicode_READY(self) because this function is only
8462 called as a callback from fixup() which does it already. */
8463 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8464 const int kind = PyUnicode_KIND(self);
8465 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008466 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008467 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 Py_ssize_t i;
8469
8470 for (i = 0; i < len; ++i) {
8471 ch = PyUnicode_READ(kind, data, i);
8472 fixed = 0;
8473 if (ch > 127) {
8474 if (Py_UNICODE_ISSPACE(ch))
8475 fixed = ' ';
8476 else {
8477 const int decimal = Py_UNICODE_TODECIMAL(ch);
8478 if (decimal >= 0)
8479 fixed = '0' + decimal;
8480 }
8481 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008482 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008483 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008484 PyUnicode_WRITE(kind, data, i, fixed);
8485 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008486 else
8487 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008489 }
8490
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008491 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492}
8493
8494PyObject *
8495_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8496{
8497 if (!PyUnicode_Check(unicode)) {
8498 PyErr_BadInternalCall();
8499 return NULL;
8500 }
8501 if (PyUnicode_READY(unicode) == -1)
8502 return NULL;
8503 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8504 /* If the string is already ASCII, just return the same string */
8505 Py_INCREF(unicode);
8506 return unicode;
8507 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008508 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509}
8510
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008511PyObject *
8512PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8513 Py_ssize_t length)
8514{
Victor Stinnerf0124502011-11-21 23:12:56 +01008515 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008516 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008517 Py_UCS4 maxchar;
8518 enum PyUnicode_Kind kind;
8519 void *data;
8520
Victor Stinner99d7ad02012-02-22 13:37:39 +01008521 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008522 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008523 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008524 if (ch > 127) {
8525 int decimal = Py_UNICODE_TODECIMAL(ch);
8526 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008527 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008528 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008529 }
8530 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008531
8532 /* Copy to a new string */
8533 decimal = PyUnicode_New(length, maxchar);
8534 if (decimal == NULL)
8535 return decimal;
8536 kind = PyUnicode_KIND(decimal);
8537 data = PyUnicode_DATA(decimal);
8538 /* Iterate over code points */
8539 for (i = 0; i < length; i++) {
8540 Py_UNICODE ch = s[i];
8541 if (ch > 127) {
8542 int decimal = Py_UNICODE_TODECIMAL(ch);
8543 if (decimal >= 0)
8544 ch = '0' + decimal;
8545 }
8546 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008548 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008549}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008550/* --- Decimal Encoder ---------------------------------------------------- */
8551
Alexander Belopolsky40018472011-02-26 01:02:56 +00008552int
8553PyUnicode_EncodeDecimal(Py_UNICODE *s,
8554 Py_ssize_t length,
8555 char *output,
8556 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008557{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008559 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008560 enum PyUnicode_Kind kind;
8561 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008562
8563 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 PyErr_BadArgument();
8565 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008566 }
8567
Victor Stinner42bf7752011-11-21 22:52:58 +01008568 unicode = PyUnicode_FromUnicode(s, length);
8569 if (unicode == NULL)
8570 return -1;
8571
Benjamin Petersonbac79492012-01-14 13:34:47 -05008572 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008573 Py_DECREF(unicode);
8574 return -1;
8575 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008576 kind = PyUnicode_KIND(unicode);
8577 data = PyUnicode_DATA(unicode);
8578
Victor Stinnerb84d7232011-11-22 01:50:07 +01008579 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008580 PyObject *exc;
8581 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008583 Py_ssize_t startpos;
8584
8585 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008586
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008588 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008589 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008591 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008592 decimal = Py_UNICODE_TODECIMAL(ch);
8593 if (decimal >= 0) {
8594 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008595 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 continue;
8597 }
8598 if (0 < ch && ch < 256) {
8599 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008600 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 continue;
8602 }
Victor Stinner6345be92011-11-25 20:09:01 +01008603
Victor Stinner42bf7752011-11-21 22:52:58 +01008604 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008605 exc = NULL;
8606 raise_encode_exception(&exc, "decimal", unicode,
8607 startpos, startpos+1,
8608 "invalid decimal Unicode string");
8609 Py_XDECREF(exc);
8610 Py_DECREF(unicode);
8611 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008612 }
8613 /* 0-terminate the output string */
8614 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008615 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008616 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008617}
8618
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619/* --- Helpers ------------------------------------------------------------ */
8620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008622any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 Py_ssize_t start,
8624 Py_ssize_t end)
8625{
8626 int kind1, kind2, kind;
8627 void *buf1, *buf2;
8628 Py_ssize_t len1, len2, result;
8629
8630 kind1 = PyUnicode_KIND(s1);
8631 kind2 = PyUnicode_KIND(s2);
8632 kind = kind1 > kind2 ? kind1 : kind2;
8633 buf1 = PyUnicode_DATA(s1);
8634 buf2 = PyUnicode_DATA(s2);
8635 if (kind1 != kind)
8636 buf1 = _PyUnicode_AsKind(s1, kind);
8637 if (!buf1)
8638 return -2;
8639 if (kind2 != kind)
8640 buf2 = _PyUnicode_AsKind(s2, kind);
8641 if (!buf2) {
8642 if (kind1 != kind) PyMem_Free(buf1);
8643 return -2;
8644 }
8645 len1 = PyUnicode_GET_LENGTH(s1);
8646 len2 = PyUnicode_GET_LENGTH(s2);
8647
Victor Stinner794d5672011-10-10 03:21:36 +02008648 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008649 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008650 case PyUnicode_1BYTE_KIND:
8651 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8652 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8653 else
8654 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8655 break;
8656 case PyUnicode_2BYTE_KIND:
8657 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8658 break;
8659 case PyUnicode_4BYTE_KIND:
8660 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8661 break;
8662 default:
8663 assert(0); result = -2;
8664 }
8665 }
8666 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008667 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008668 case PyUnicode_1BYTE_KIND:
8669 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8670 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8671 else
8672 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8673 break;
8674 case PyUnicode_2BYTE_KIND:
8675 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8676 break;
8677 case PyUnicode_4BYTE_KIND:
8678 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8679 break;
8680 default:
8681 assert(0); result = -2;
8682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008683 }
8684
8685 if (kind1 != kind)
8686 PyMem_Free(buf1);
8687 if (kind2 != kind)
8688 PyMem_Free(buf2);
8689
8690 return result;
8691}
8692
8693Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008694_PyUnicode_InsertThousandsGrouping(
8695 PyObject *unicode, Py_ssize_t index,
8696 Py_ssize_t n_buffer,
8697 void *digits, Py_ssize_t n_digits,
8698 Py_ssize_t min_width,
8699 const char *grouping, PyObject *thousands_sep,
8700 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701{
Victor Stinner41a863c2012-02-24 00:37:51 +01008702 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008703 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008704 Py_ssize_t thousands_sep_len;
8705 Py_ssize_t len;
8706
8707 if (unicode != NULL) {
8708 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008709 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008710 }
8711 else {
8712 kind = PyUnicode_1BYTE_KIND;
8713 data = NULL;
8714 }
8715 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8716 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8717 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8718 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008719 if (thousands_sep_kind < kind) {
8720 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8721 if (!thousands_sep_data)
8722 return -1;
8723 }
8724 else {
8725 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8726 if (!data)
8727 return -1;
8728 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008729 }
8730
Benjamin Petersonead6b532011-12-20 17:23:42 -06008731 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008733 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008734 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008735 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008736 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008737 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008738 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008739 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008740 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008741 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008742 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008743 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008745 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008746 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008747 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008748 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008749 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008751 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008752 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008753 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008754 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008755 break;
8756 default:
8757 assert(0);
8758 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008760 if (unicode != NULL && thousands_sep_kind != kind) {
8761 if (thousands_sep_kind < kind)
8762 PyMem_Free(thousands_sep_data);
8763 else
8764 PyMem_Free(data);
8765 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008766 if (unicode == NULL) {
8767 *maxchar = 127;
8768 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008769 *maxchar = MAX_MAXCHAR(*maxchar,
8770 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008771 }
8772 }
8773 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774}
8775
8776
/* Helper macro to fix up start/end slice values in place:
     - `end` larger than `len` is clamped to `len`;
     - negative `end` / `start` are taken relative to `len`, and
       floored at 0 if still negative.
   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so pass side-effect-free expressions.
   The body is wrapped in do { } while (0) so the macro expands to a
   single statement and is safe inside unbraced if/else bodies; invoke
   it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008791
Alexander Belopolsky40018472011-02-26 01:02:56 +00008792Py_ssize_t
8793PyUnicode_Count(PyObject *str,
8794 PyObject *substr,
8795 Py_ssize_t start,
8796 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008797{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008798 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008799 PyObject* str_obj;
8800 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008801 int kind1, kind2, kind;
8802 void *buf1 = NULL, *buf2 = NULL;
8803 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008804
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008805 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008806 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008808 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008809 if (!sub_obj) {
8810 Py_DECREF(str_obj);
8811 return -1;
8812 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008813 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008814 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 Py_DECREF(str_obj);
8816 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008817 }
Tim Petersced69f82003-09-16 20:30:58 +00008818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819 kind1 = PyUnicode_KIND(str_obj);
8820 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008821 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008824 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008825 if (kind2 > kind) {
8826 Py_DECREF(sub_obj);
8827 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008828 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008829 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008830 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008831 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008832 if (!buf2)
8833 goto onError;
8834 len1 = PyUnicode_GET_LENGTH(str_obj);
8835 len2 = PyUnicode_GET_LENGTH(sub_obj);
8836
8837 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008838 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008839 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008840 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8841 result = asciilib_count(
8842 ((Py_UCS1*)buf1) + start, end - start,
8843 buf2, len2, PY_SSIZE_T_MAX
8844 );
8845 else
8846 result = ucs1lib_count(
8847 ((Py_UCS1*)buf1) + start, end - start,
8848 buf2, len2, PY_SSIZE_T_MAX
8849 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 break;
8851 case PyUnicode_2BYTE_KIND:
8852 result = ucs2lib_count(
8853 ((Py_UCS2*)buf1) + start, end - start,
8854 buf2, len2, PY_SSIZE_T_MAX
8855 );
8856 break;
8857 case PyUnicode_4BYTE_KIND:
8858 result = ucs4lib_count(
8859 ((Py_UCS4*)buf1) + start, end - start,
8860 buf2, len2, PY_SSIZE_T_MAX
8861 );
8862 break;
8863 default:
8864 assert(0); result = 0;
8865 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008866
8867 Py_DECREF(sub_obj);
8868 Py_DECREF(str_obj);
8869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008870 if (kind2 != kind)
8871 PyMem_Free(buf2);
8872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008874 onError:
8875 Py_DECREF(sub_obj);
8876 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008877 if (kind2 != kind && buf2)
8878 PyMem_Free(buf2);
8879 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880}
8881
Alexander Belopolsky40018472011-02-26 01:02:56 +00008882Py_ssize_t
8883PyUnicode_Find(PyObject *str,
8884 PyObject *sub,
8885 Py_ssize_t start,
8886 Py_ssize_t end,
8887 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008889 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008890
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008892 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008894 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008895 if (!sub) {
8896 Py_DECREF(str);
8897 return -2;
8898 }
8899 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8900 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 Py_DECREF(str);
8902 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903 }
Tim Petersced69f82003-09-16 20:30:58 +00008904
Victor Stinner794d5672011-10-10 03:21:36 +02008905 result = any_find_slice(direction,
8906 str, sub, start, end
8907 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008908
Guido van Rossumd57fd912000-03-10 22:53:23 +00008909 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008910 Py_DECREF(sub);
8911
Guido van Rossumd57fd912000-03-10 22:53:23 +00008912 return result;
8913}
8914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915Py_ssize_t
8916PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8917 Py_ssize_t start, Py_ssize_t end,
8918 int direction)
8919{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008920 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008921 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 if (PyUnicode_READY(str) == -1)
8923 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008924 if (start < 0 || end < 0) {
8925 PyErr_SetString(PyExc_IndexError, "string index out of range");
8926 return -2;
8927 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008928 if (end > PyUnicode_GET_LENGTH(str))
8929 end = PyUnicode_GET_LENGTH(str);
8930 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008931 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8932 kind, end-start, ch, direction);
8933 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008934 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008935 else
8936 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008937}
8938
Alexander Belopolsky40018472011-02-26 01:02:56 +00008939static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008940tailmatch(PyObject *self,
8941 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008942 Py_ssize_t start,
8943 Py_ssize_t end,
8944 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946 int kind_self;
8947 int kind_sub;
8948 void *data_self;
8949 void *data_sub;
8950 Py_ssize_t offset;
8951 Py_ssize_t i;
8952 Py_ssize_t end_sub;
8953
8954 if (PyUnicode_READY(self) == -1 ||
8955 PyUnicode_READY(substring) == -1)
8956 return 0;
8957
8958 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008959 return 1;
8960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8962 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 kind_self = PyUnicode_KIND(self);
8967 data_self = PyUnicode_DATA(self);
8968 kind_sub = PyUnicode_KIND(substring);
8969 data_sub = PyUnicode_DATA(substring);
8970 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8971
8972 if (direction > 0)
8973 offset = end;
8974 else
8975 offset = start;
8976
8977 if (PyUnicode_READ(kind_self, data_self, offset) ==
8978 PyUnicode_READ(kind_sub, data_sub, 0) &&
8979 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8980 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8981 /* If both are of the same kind, memcmp is sufficient */
8982 if (kind_self == kind_sub) {
8983 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008984 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 data_sub,
8986 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008987 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 }
8989 /* otherwise we have to compare each character by first accesing it */
8990 else {
8991 /* We do not need to compare 0 and len(substring)-1 because
8992 the if statement above ensured already that they are equal
8993 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02008994 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 for (i = 1; i < end_sub; ++i) {
8996 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8997 PyUnicode_READ(kind_sub, data_sub, i))
8998 return 0;
8999 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009000 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009002 }
9003
9004 return 0;
9005}
9006
Alexander Belopolsky40018472011-02-26 01:02:56 +00009007Py_ssize_t
9008PyUnicode_Tailmatch(PyObject *str,
9009 PyObject *substr,
9010 Py_ssize_t start,
9011 Py_ssize_t end,
9012 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009014 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009015
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016 str = PyUnicode_FromObject(str);
9017 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 substr = PyUnicode_FromObject(substr);
9020 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009021 Py_DECREF(str);
9022 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 }
Tim Petersced69f82003-09-16 20:30:58 +00009024
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009025 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009026 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027 Py_DECREF(str);
9028 Py_DECREF(substr);
9029 return result;
9030}
9031
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032/* Apply fixfct filter to the Unicode object self and return a
9033 reference to the modified object */
9034
Alexander Belopolsky40018472011-02-26 01:02:56 +00009035static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009036fixup(PyObject *self,
9037 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 PyObject *u;
9040 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009041 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009043 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009045 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009046 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009048 /* fix functions return the new maximum character in a string,
9049 if the kind of the resulting unicode object does not change,
9050 everything is fine. Otherwise we need to change the string kind
9051 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009052 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009053
9054 if (maxchar_new == 0) {
9055 /* no changes */;
9056 if (PyUnicode_CheckExact(self)) {
9057 Py_DECREF(u);
9058 Py_INCREF(self);
9059 return self;
9060 }
9061 else
9062 return u;
9063 }
9064
Victor Stinnere6abb482012-05-02 01:15:40 +02009065 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066
Victor Stinnereaab6042011-12-11 22:22:39 +01009067 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009069
9070 /* In case the maximum character changed, we need to
9071 convert the string to the new category. */
9072 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9073 if (v == NULL) {
9074 Py_DECREF(u);
9075 return NULL;
9076 }
9077 if (maxchar_new > maxchar_old) {
9078 /* If the maxchar increased so that the kind changed, not all
9079 characters are representable anymore and we need to fix the
9080 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009081 _PyUnicode_FastCopyCharacters(v, 0,
9082 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009083 maxchar_old = fixfct(v);
9084 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 }
9086 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009087 _PyUnicode_FastCopyCharacters(v, 0,
9088 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009090 Py_DECREF(u);
9091 assert(_PyUnicode_CheckConsistency(v, 1));
9092 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093}
9094
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009095static PyObject *
9096ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009098 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9099 char *resdata, *data = PyUnicode_DATA(self);
9100 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009101
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009102 res = PyUnicode_New(len, 127);
9103 if (res == NULL)
9104 return NULL;
9105 resdata = PyUnicode_DATA(res);
9106 if (lower)
9107 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009109 _Py_bytes_upper(resdata, data, len);
9110 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111}
9112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009114handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009115{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009116 Py_ssize_t j;
9117 int final_sigma;
9118 Py_UCS4 c;
9119 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009120
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009121 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9122
9123 where ! is a negation and \p{xxx} is a character with property xxx.
9124 */
9125 for (j = i - 1; j >= 0; j--) {
9126 c = PyUnicode_READ(kind, data, j);
9127 if (!_PyUnicode_IsCaseIgnorable(c))
9128 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009130 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9131 if (final_sigma) {
9132 for (j = i + 1; j < length; j++) {
9133 c = PyUnicode_READ(kind, data, j);
9134 if (!_PyUnicode_IsCaseIgnorable(c))
9135 break;
9136 }
9137 final_sigma = j == length || !_PyUnicode_IsCased(c);
9138 }
9139 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140}
9141
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009142static int
9143lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9144 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009146 /* Obscure special case. */
9147 if (c == 0x3A3) {
9148 mapped[0] = handle_capital_sigma(kind, data, length, i);
9149 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009151 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152}
9153
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009154static Py_ssize_t
9155do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009157 Py_ssize_t i, k = 0;
9158 int n_res, j;
9159 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009160
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009161 c = PyUnicode_READ(kind, data, 0);
9162 n_res = _PyUnicode_ToUpperFull(c, mapped);
9163 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009164 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009165 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009167 for (i = 1; i < length; i++) {
9168 c = PyUnicode_READ(kind, data, i);
9169 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9170 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009171 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009172 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009173 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009174 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009175 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009176}
9177
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009178static Py_ssize_t
9179do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9180 Py_ssize_t i, k = 0;
9181
9182 for (i = 0; i < length; i++) {
9183 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9184 int n_res, j;
9185 if (Py_UNICODE_ISUPPER(c)) {
9186 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9187 }
9188 else if (Py_UNICODE_ISLOWER(c)) {
9189 n_res = _PyUnicode_ToUpperFull(c, mapped);
9190 }
9191 else {
9192 n_res = 1;
9193 mapped[0] = c;
9194 }
9195 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009196 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009197 res[k++] = mapped[j];
9198 }
9199 }
9200 return k;
9201}
9202
9203static Py_ssize_t
9204do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9205 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009206{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009207 Py_ssize_t i, k = 0;
9208
9209 for (i = 0; i < length; i++) {
9210 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9211 int n_res, j;
9212 if (lower)
9213 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9214 else
9215 n_res = _PyUnicode_ToUpperFull(c, mapped);
9216 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009217 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009218 res[k++] = mapped[j];
9219 }
9220 }
9221 return k;
9222}
9223
9224static Py_ssize_t
9225do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9226{
9227 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9228}
9229
9230static Py_ssize_t
9231do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9232{
9233 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9234}
9235
Benjamin Petersone51757f2012-01-12 21:10:29 -05009236static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009237do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9238{
9239 Py_ssize_t i, k = 0;
9240
9241 for (i = 0; i < length; i++) {
9242 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9243 Py_UCS4 mapped[3];
9244 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9245 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009246 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009247 res[k++] = mapped[j];
9248 }
9249 }
9250 return k;
9251}
9252
9253static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009254do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9255{
9256 Py_ssize_t i, k = 0;
9257 int previous_is_cased;
9258
9259 previous_is_cased = 0;
9260 for (i = 0; i < length; i++) {
9261 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9262 Py_UCS4 mapped[3];
9263 int n_res, j;
9264
9265 if (previous_is_cased)
9266 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9267 else
9268 n_res = _PyUnicode_ToTitleFull(c, mapped);
9269
9270 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009271 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009272 res[k++] = mapped[j];
9273 }
9274
9275 previous_is_cased = _PyUnicode_IsCased(c);
9276 }
9277 return k;
9278}
9279
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009280static PyObject *
9281case_operation(PyObject *self,
9282 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9283{
9284 PyObject *res = NULL;
9285 Py_ssize_t length, newlength = 0;
9286 int kind, outkind;
9287 void *data, *outdata;
9288 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9289
Benjamin Petersoneea48462012-01-16 14:28:50 -05009290 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009291
9292 kind = PyUnicode_KIND(self);
9293 data = PyUnicode_DATA(self);
9294 length = PyUnicode_GET_LENGTH(self);
9295 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9296 if (tmp == NULL)
9297 return PyErr_NoMemory();
9298 newlength = perform(kind, data, length, tmp, &maxchar);
9299 res = PyUnicode_New(newlength, maxchar);
9300 if (res == NULL)
9301 goto leave;
9302 tmpend = tmp + newlength;
9303 outdata = PyUnicode_DATA(res);
9304 outkind = PyUnicode_KIND(res);
9305 switch (outkind) {
9306 case PyUnicode_1BYTE_KIND:
9307 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9308 break;
9309 case PyUnicode_2BYTE_KIND:
9310 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9311 break;
9312 case PyUnicode_4BYTE_KIND:
9313 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9314 break;
9315 default:
9316 assert(0);
9317 break;
9318 }
9319 leave:
9320 PyMem_FREE(tmp);
9321 return res;
9322}
9323
Tim Peters8ce9f162004-08-27 01:49:32 +00009324PyObject *
9325PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009326{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009328 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009330 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009331 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9332 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009333 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009334 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009335 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009336 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009337 int use_memcpy;
9338 unsigned char *res_data = NULL, *sep_data = NULL;
9339 PyObject *last_obj;
9340 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341
Tim Peters05eba1f2004-08-27 21:32:02 +00009342 fseq = PySequence_Fast(seq, "");
9343 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009344 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009345 }
9346
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009347 /* NOTE: the following code can't call back into Python code,
9348 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009349 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009350
Tim Peters05eba1f2004-08-27 21:32:02 +00009351 seqlen = PySequence_Fast_GET_SIZE(fseq);
9352 /* If empty sequence, return u"". */
9353 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009354 Py_DECREF(fseq);
9355 Py_INCREF(unicode_empty);
9356 res = unicode_empty;
9357 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009358 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009359
Tim Peters05eba1f2004-08-27 21:32:02 +00009360 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009361 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009362 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009363 if (seqlen == 1) {
9364 if (PyUnicode_CheckExact(items[0])) {
9365 res = items[0];
9366 Py_INCREF(res);
9367 Py_DECREF(fseq);
9368 return res;
9369 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009370 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009371 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009372 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009373 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009374 /* Set up sep and seplen */
9375 if (separator == NULL) {
9376 /* fall back to a blank space separator */
9377 sep = PyUnicode_FromOrdinal(' ');
9378 if (!sep)
9379 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009380 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009381 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009382 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009383 else {
9384 if (!PyUnicode_Check(separator)) {
9385 PyErr_Format(PyExc_TypeError,
9386 "separator: expected str instance,"
9387 " %.80s found",
9388 Py_TYPE(separator)->tp_name);
9389 goto onError;
9390 }
9391 if (PyUnicode_READY(separator))
9392 goto onError;
9393 sep = separator;
9394 seplen = PyUnicode_GET_LENGTH(separator);
9395 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9396 /* inc refcount to keep this code path symmetric with the
9397 above case of a blank separator */
9398 Py_INCREF(sep);
9399 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009400 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009401 }
9402
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009403 /* There are at least two things to join, or else we have a subclass
9404 * of str in the sequence.
9405 * Do a pre-pass to figure out the total amount of space we'll
9406 * need (sz), and see whether all argument are strings.
9407 */
9408 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009409#ifdef Py_DEBUG
9410 use_memcpy = 0;
9411#else
9412 use_memcpy = 1;
9413#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009414 for (i = 0; i < seqlen; i++) {
9415 const Py_ssize_t old_sz = sz;
9416 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009417 if (!PyUnicode_Check(item)) {
9418 PyErr_Format(PyExc_TypeError,
9419 "sequence item %zd: expected str instance,"
9420 " %.80s found",
9421 i, Py_TYPE(item)->tp_name);
9422 goto onError;
9423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 if (PyUnicode_READY(item) == -1)
9425 goto onError;
9426 sz += PyUnicode_GET_LENGTH(item);
9427 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009428 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009429 if (i != 0)
9430 sz += seplen;
9431 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9432 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009433 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009434 goto onError;
9435 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009436 if (use_memcpy && last_obj != NULL) {
9437 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9438 use_memcpy = 0;
9439 }
9440 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009441 }
Tim Petersced69f82003-09-16 20:30:58 +00009442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009444 if (res == NULL)
9445 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009446
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009447 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009448#ifdef Py_DEBUG
9449 use_memcpy = 0;
9450#else
9451 if (use_memcpy) {
9452 res_data = PyUnicode_1BYTE_DATA(res);
9453 kind = PyUnicode_KIND(res);
9454 if (seplen != 0)
9455 sep_data = PyUnicode_1BYTE_DATA(sep);
9456 }
9457#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009459 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009460 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009461 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009462 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009463 if (use_memcpy) {
9464 Py_MEMCPY(res_data,
9465 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009466 kind * seplen);
9467 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009468 }
9469 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009470 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009471 res_offset += seplen;
9472 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009474 itemlen = PyUnicode_GET_LENGTH(item);
9475 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009476 if (use_memcpy) {
9477 Py_MEMCPY(res_data,
9478 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009479 kind * itemlen);
9480 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009481 }
9482 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009483 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009484 res_offset += itemlen;
9485 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009486 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009487 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009488 if (use_memcpy)
9489 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009490 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009491 else
9492 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009493
Tim Peters05eba1f2004-08-27 21:32:02 +00009494 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009496 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009498
Benjamin Peterson29060642009-01-31 22:14:21 +00009499 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009500 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009501 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009502 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503 return NULL;
9504}
9505
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the element width
   (1, 2 or 4 byte storage); the wchar kind is not supported (asserted).

   All arguments are parenthesized in the expansion, so arbitrary
   expressions may be passed.  Note that arguments may be evaluated more
   than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9529
Victor Stinnerd3f08822012-05-29 12:57:52 +02009530void
9531_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9532 Py_UCS4 fill_char)
9533{
9534 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9535 const void *data = PyUnicode_DATA(unicode);
9536 assert(PyUnicode_IS_READY(unicode));
9537 assert(unicode_modifiable(unicode));
9538 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9539 assert(start >= 0);
9540 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9541 FILL(kind, data, fill_char, start, length);
9542}
9543
Victor Stinner3fe55312012-01-04 00:33:50 +01009544Py_ssize_t
9545PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9546 Py_UCS4 fill_char)
9547{
9548 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009549
9550 if (!PyUnicode_Check(unicode)) {
9551 PyErr_BadInternalCall();
9552 return -1;
9553 }
9554 if (PyUnicode_READY(unicode) == -1)
9555 return -1;
9556 if (unicode_check_modifiable(unicode))
9557 return -1;
9558
Victor Stinnerd3f08822012-05-29 12:57:52 +02009559 if (start < 0) {
9560 PyErr_SetString(PyExc_IndexError, "string index out of range");
9561 return -1;
9562 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009563 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9564 PyErr_SetString(PyExc_ValueError,
9565 "fill character is bigger than "
9566 "the string maximum character");
9567 return -1;
9568 }
9569
9570 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9571 length = Py_MIN(maxlen, length);
9572 if (length <= 0)
9573 return 0;
9574
Victor Stinnerd3f08822012-05-29 12:57:52 +02009575 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009576 return length;
9577}
9578
Victor Stinner9310abb2011-10-05 00:59:23 +02009579static PyObject *
9580pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009581 Py_ssize_t left,
9582 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 PyObject *u;
9586 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009587 int kind;
9588 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589
9590 if (left < 0)
9591 left = 0;
9592 if (right < 0)
9593 right = 0;
9594
Victor Stinnerc4b49542011-12-11 22:44:26 +01009595 if (left == 0 && right == 0)
9596 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9599 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009600 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9601 return NULL;
9602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009604 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009606 if (!u)
9607 return NULL;
9608
9609 kind = PyUnicode_KIND(u);
9610 data = PyUnicode_DATA(u);
9611 if (left)
9612 FILL(kind, data, fill, 0, left);
9613 if (right)
9614 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009615 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009616 assert(_PyUnicode_CheckConsistency(u, 1));
9617 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009618}
9619
Alexander Belopolsky40018472011-02-26 01:02:56 +00009620PyObject *
9621PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009623 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624
9625 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009626 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009627 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009628 if (PyUnicode_READY(string) == -1) {
9629 Py_DECREF(string);
9630 return NULL;
9631 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009632
Benjamin Petersonead6b532011-12-20 17:23:42 -06009633 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009635 if (PyUnicode_IS_ASCII(string))
9636 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009637 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009638 PyUnicode_GET_LENGTH(string), keepends);
9639 else
9640 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009641 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009642 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009643 break;
9644 case PyUnicode_2BYTE_KIND:
9645 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009646 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009647 PyUnicode_GET_LENGTH(string), keepends);
9648 break;
9649 case PyUnicode_4BYTE_KIND:
9650 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009651 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 PyUnicode_GET_LENGTH(string), keepends);
9653 break;
9654 default:
9655 assert(0);
9656 list = 0;
9657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658 Py_DECREF(string);
9659 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660}
9661
Alexander Belopolsky40018472011-02-26 01:02:56 +00009662static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009663split(PyObject *self,
9664 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009665 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667 int kind1, kind2, kind;
9668 void *buf1, *buf2;
9669 Py_ssize_t len1, len2;
9670 PyObject* out;
9671
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009673 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 if (PyUnicode_READY(self) == -1)
9676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009679 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009681 if (PyUnicode_IS_ASCII(self))
9682 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009683 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009684 PyUnicode_GET_LENGTH(self), maxcount
9685 );
9686 else
9687 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009688 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009689 PyUnicode_GET_LENGTH(self), maxcount
9690 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 case PyUnicode_2BYTE_KIND:
9692 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009693 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 PyUnicode_GET_LENGTH(self), maxcount
9695 );
9696 case PyUnicode_4BYTE_KIND:
9697 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009698 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 PyUnicode_GET_LENGTH(self), maxcount
9700 );
9701 default:
9702 assert(0);
9703 return NULL;
9704 }
9705
9706 if (PyUnicode_READY(substring) == -1)
9707 return NULL;
9708
9709 kind1 = PyUnicode_KIND(self);
9710 kind2 = PyUnicode_KIND(substring);
9711 kind = kind1 > kind2 ? kind1 : kind2;
9712 buf1 = PyUnicode_DATA(self);
9713 buf2 = PyUnicode_DATA(substring);
9714 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009715 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 if (!buf1)
9717 return NULL;
9718 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009719 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 if (!buf2) {
9721 if (kind1 != kind) PyMem_Free(buf1);
9722 return NULL;
9723 }
9724 len1 = PyUnicode_GET_LENGTH(self);
9725 len2 = PyUnicode_GET_LENGTH(substring);
9726
Benjamin Petersonead6b532011-12-20 17:23:42 -06009727 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009728 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009729 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9730 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009731 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009732 else
9733 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009734 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 break;
9736 case PyUnicode_2BYTE_KIND:
9737 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009738 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 break;
9740 case PyUnicode_4BYTE_KIND:
9741 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009742 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 break;
9744 default:
9745 out = NULL;
9746 }
9747 if (kind1 != kind)
9748 PyMem_Free(buf1);
9749 if (kind2 != kind)
9750 PyMem_Free(buf2);
9751 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009752}
9753
Alexander Belopolsky40018472011-02-26 01:02:56 +00009754static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009755rsplit(PyObject *self,
9756 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009757 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009758{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 int kind1, kind2, kind;
9760 void *buf1, *buf2;
9761 Py_ssize_t len1, len2;
9762 PyObject* out;
9763
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009764 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009765 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009766
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 if (PyUnicode_READY(self) == -1)
9768 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009771 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009773 if (PyUnicode_IS_ASCII(self))
9774 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009775 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009776 PyUnicode_GET_LENGTH(self), maxcount
9777 );
9778 else
9779 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009780 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009781 PyUnicode_GET_LENGTH(self), maxcount
9782 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 case PyUnicode_2BYTE_KIND:
9784 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009785 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 PyUnicode_GET_LENGTH(self), maxcount
9787 );
9788 case PyUnicode_4BYTE_KIND:
9789 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009790 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 PyUnicode_GET_LENGTH(self), maxcount
9792 );
9793 default:
9794 assert(0);
9795 return NULL;
9796 }
9797
9798 if (PyUnicode_READY(substring) == -1)
9799 return NULL;
9800
9801 kind1 = PyUnicode_KIND(self);
9802 kind2 = PyUnicode_KIND(substring);
9803 kind = kind1 > kind2 ? kind1 : kind2;
9804 buf1 = PyUnicode_DATA(self);
9805 buf2 = PyUnicode_DATA(substring);
9806 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009807 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (!buf1)
9809 return NULL;
9810 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009811 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (!buf2) {
9813 if (kind1 != kind) PyMem_Free(buf1);
9814 return NULL;
9815 }
9816 len1 = PyUnicode_GET_LENGTH(self);
9817 len2 = PyUnicode_GET_LENGTH(substring);
9818
Benjamin Petersonead6b532011-12-20 17:23:42 -06009819 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9822 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009823 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824 else
9825 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 break;
9828 case PyUnicode_2BYTE_KIND:
9829 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 break;
9832 case PyUnicode_4BYTE_KIND:
9833 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009834 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 break;
9836 default:
9837 out = NULL;
9838 }
9839 if (kind1 != kind)
9840 PyMem_Free(buf1);
9841 if (kind2 != kind)
9842 PyMem_Free(buf2);
9843 return out;
9844}
9845
9846static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009847anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9848 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009850 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009852 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9853 return asciilib_find(buf1, len1, buf2, len2, offset);
9854 else
9855 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 case PyUnicode_2BYTE_KIND:
9857 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9858 case PyUnicode_4BYTE_KIND:
9859 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9860 }
9861 assert(0);
9862 return -1;
9863}
9864
9865static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9867 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009869 switch (kind) {
9870 case PyUnicode_1BYTE_KIND:
9871 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9872 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9873 else
9874 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9875 case PyUnicode_2BYTE_KIND:
9876 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9877 case PyUnicode_4BYTE_KIND:
9878 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9879 }
9880 assert(0);
9881 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009882}
9883
Alexander Belopolsky40018472011-02-26 01:02:56 +00009884static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885replace(PyObject *self, PyObject *str1,
9886 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 PyObject *u;
9889 char *sbuf = PyUnicode_DATA(self);
9890 char *buf1 = PyUnicode_DATA(str1);
9891 char *buf2 = PyUnicode_DATA(str2);
9892 int srelease = 0, release1 = 0, release2 = 0;
9893 int skind = PyUnicode_KIND(self);
9894 int kind1 = PyUnicode_KIND(str1);
9895 int kind2 = PyUnicode_KIND(str2);
9896 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9897 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9898 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009899 int mayshrink;
9900 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009901
9902 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009903 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009905 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906
Victor Stinner59de0ee2011-10-07 10:01:28 +02009907 if (str1 == str2)
9908 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 if (skind < kind1)
9910 /* substring too wide to be present */
9911 goto nothing;
9912
Victor Stinner49a0a212011-10-12 23:46:10 +02009913 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9914 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9915 /* Replacing str1 with str2 may cause a maxchar reduction in the
9916 result string. */
9917 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009918 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009921 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009923 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009925 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009926 Py_UCS4 u1, u2;
9927 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009928 Py_ssize_t index, pos;
9929 char *src;
9930
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009931 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009932 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9933 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009934 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009935 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009937 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009939 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009941
9942 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9943 index = 0;
9944 src = sbuf;
9945 while (--maxcount)
9946 {
9947 pos++;
9948 src += pos * PyUnicode_KIND(self);
9949 slen -= pos;
9950 index += pos;
9951 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9952 if (pos < 0)
9953 break;
9954 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9955 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009956 }
9957 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 int rkind = skind;
9959 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009960 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 if (kind1 < rkind) {
9963 /* widen substring */
9964 buf1 = _PyUnicode_AsKind(str1, rkind);
9965 if (!buf1) goto error;
9966 release1 = 1;
9967 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009969 if (i < 0)
9970 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009971 if (rkind > kind2) {
9972 /* widen replacement */
9973 buf2 = _PyUnicode_AsKind(str2, rkind);
9974 if (!buf2) goto error;
9975 release2 = 1;
9976 }
9977 else if (rkind < kind2) {
9978 /* widen self and buf1 */
9979 rkind = kind2;
9980 if (release1) PyMem_Free(buf1);
9981 sbuf = _PyUnicode_AsKind(self, rkind);
9982 if (!sbuf) goto error;
9983 srelease = 1;
9984 buf1 = _PyUnicode_AsKind(str1, rkind);
9985 if (!buf1) goto error;
9986 release1 = 1;
9987 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009988 u = PyUnicode_New(slen, maxchar);
9989 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009991 assert(PyUnicode_KIND(u) == rkind);
9992 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009993
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009994 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009995 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009996 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009998 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009999 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010000
10001 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010002 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010003 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010005 if (i == -1)
10006 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010007 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010009 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010013 }
10014 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010016 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 int rkind = skind;
10018 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010021 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 buf1 = _PyUnicode_AsKind(str1, rkind);
10023 if (!buf1) goto error;
10024 release1 = 1;
10025 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010026 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010027 if (n == 0)
10028 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010030 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 buf2 = _PyUnicode_AsKind(str2, rkind);
10032 if (!buf2) goto error;
10033 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010036 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 rkind = kind2;
10038 sbuf = _PyUnicode_AsKind(self, rkind);
10039 if (!sbuf) goto error;
10040 srelease = 1;
10041 if (release1) PyMem_Free(buf1);
10042 buf1 = _PyUnicode_AsKind(str1, rkind);
10043 if (!buf1) goto error;
10044 release1 = 1;
10045 }
10046 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10047 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010048 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 PyErr_SetString(PyExc_OverflowError,
10050 "replace string is too long");
10051 goto error;
10052 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010053 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010054 if (new_size == 0) {
10055 Py_INCREF(unicode_empty);
10056 u = unicode_empty;
10057 goto done;
10058 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010059 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 PyErr_SetString(PyExc_OverflowError,
10061 "replace string is too long");
10062 goto error;
10063 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010064 u = PyUnicode_New(new_size, maxchar);
10065 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010067 assert(PyUnicode_KIND(u) == rkind);
10068 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 ires = i = 0;
10070 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 while (n-- > 0) {
10072 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010073 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010074 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010075 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010076 if (j == -1)
10077 break;
10078 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010079 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010080 memcpy(res + rkind * ires,
10081 sbuf + rkind * i,
10082 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010084 }
10085 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010087 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010088 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010089 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010094 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010095 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010096 memcpy(res + rkind * ires,
10097 sbuf + rkind * i,
10098 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010099 }
10100 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010101 /* interleave */
10102 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010103 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010105 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010107 if (--n <= 0)
10108 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010109 memcpy(res + rkind * ires,
10110 sbuf + rkind * i,
10111 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 ires++;
10113 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010115 memcpy(res + rkind * ires,
10116 sbuf + rkind * i,
10117 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010118 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010119 }
10120
10121 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010122 unicode_adjust_maxchar(&u);
10123 if (u == NULL)
10124 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010126
10127 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (srelease)
10129 PyMem_FREE(sbuf);
10130 if (release1)
10131 PyMem_FREE(buf1);
10132 if (release2)
10133 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010134 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010136
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010138 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 if (srelease)
10140 PyMem_FREE(sbuf);
10141 if (release1)
10142 PyMem_FREE(buf1);
10143 if (release2)
10144 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010145 return unicode_result_unchanged(self);
10146
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 error:
10148 if (srelease && sbuf)
10149 PyMem_FREE(sbuf);
10150 if (release1 && buf1)
10151 PyMem_FREE(buf1);
10152 if (release2 && buf2)
10153 PyMem_FREE(buf2);
10154 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010155}
10156
10157/* --- Unicode Object Methods --------------------------------------------- */
10158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010159PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161\n\
10162Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010163characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010164
10165static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010166unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010168 if (PyUnicode_READY(self) == -1)
10169 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010170 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010171}
10172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010173PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010174 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175\n\
10176Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010177have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178
10179static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010180unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010181{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010182 if (PyUnicode_READY(self) == -1)
10183 return NULL;
10184 if (PyUnicode_GET_LENGTH(self) == 0)
10185 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010186 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187}
10188
Benjamin Petersond5890c82012-01-14 13:23:30 -050010189PyDoc_STRVAR(casefold__doc__,
10190 "S.casefold() -> str\n\
10191\n\
10192Return a version of S suitable for caseless comparisons.");
10193
10194static PyObject *
10195unicode_casefold(PyObject *self)
10196{
10197 if (PyUnicode_READY(self) == -1)
10198 return NULL;
10199 if (PyUnicode_IS_ASCII(self))
10200 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010201 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010202}
10203
10204
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010205/* Argument converter. Coerces to a single unicode character */
10206
10207static int
10208convert_uc(PyObject *obj, void *addr)
10209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010211 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010212
Benjamin Peterson14339b62009-01-31 16:36:08 +000010213 uniobj = PyUnicode_FromObject(obj);
10214 if (uniobj == NULL) {
10215 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010216 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010217 return 0;
10218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010220 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010221 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010222 Py_DECREF(uniobj);
10223 return 0;
10224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010226 Py_DECREF(uniobj);
10227 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010228}
10229
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010230PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010232\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010233Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010234done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235
10236static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010237unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010239 Py_ssize_t marg, left;
10240 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 Py_UCS4 fillchar = ' ';
10242
Victor Stinnere9a29352011-10-01 02:14:59 +020010243 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010245
Benjamin Petersonbac79492012-01-14 13:34:47 -050010246 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247 return NULL;
10248
Victor Stinnerc4b49542011-12-11 22:44:26 +010010249 if (PyUnicode_GET_LENGTH(self) >= width)
10250 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Victor Stinnerc4b49542011-12-11 22:44:26 +010010252 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 left = marg / 2 + (marg & width & 1);
10254
Victor Stinner9310abb2011-10-05 00:59:23 +020010255 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256}
10257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258/* This function assumes that str1 and str2 are readied by the caller. */
10259
Marc-André Lemburge5034372000-08-08 08:04:29 +000010260static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010261unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 int kind1, kind2;
10264 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010265 Py_ssize_t len1, len2;
10266 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010267
Victor Stinner90db9c42012-10-04 21:53:50 +020010268 /* a string is equal to itself */
10269 if (str1 == str2)
10270 return 0;
10271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 kind1 = PyUnicode_KIND(str1);
10273 kind2 = PyUnicode_KIND(str2);
10274 data1 = PyUnicode_DATA(str1);
10275 data2 = PyUnicode_DATA(str2);
10276 len1 = PyUnicode_GET_LENGTH(str1);
10277 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010278 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010279
Victor Stinner770e19e2012-10-04 22:59:45 +020010280 if (kind1 == 1 && kind2 == 1) {
10281 int cmp = memcmp(data1, data2, len);
10282 /* normalize result of memcmp() into the range [-1; 1] */
10283 if (cmp < 0)
10284 return -1;
10285 if (cmp > 0)
10286 return 1;
10287 }
10288 else {
10289 for (i = 0; i < len; ++i) {
10290 Py_UCS4 c1, c2;
10291 c1 = PyUnicode_READ(kind1, data1, i);
10292 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010293
Victor Stinner770e19e2012-10-04 22:59:45 +020010294 if (c1 != c2)
10295 return (c1 < c2) ? -1 : 1;
10296 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010297 }
10298
Victor Stinner770e19e2012-10-04 22:59:45 +020010299 if (len1 == len2)
10300 return 0;
10301 if (len1 < len2)
10302 return -1;
10303 else
10304 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010305}
10306
Alexander Belopolsky40018472011-02-26 01:02:56 +000010307int
10308PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010309{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10311 if (PyUnicode_READY(left) == -1 ||
10312 PyUnicode_READY(right) == -1)
10313 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010314 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010316 PyErr_Format(PyExc_TypeError,
10317 "Can't compare %.100s and %.100s",
10318 left->ob_type->tp_name,
10319 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320 return -1;
10321}
10322
Martin v. Löwis5b222132007-06-10 09:51:05 +000010323int
10324PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 Py_ssize_t i;
10327 int kind;
10328 void *data;
10329 Py_UCS4 chr;
10330
Victor Stinner910337b2011-10-03 03:20:16 +020010331 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 if (PyUnicode_READY(uni) == -1)
10333 return -1;
10334 kind = PyUnicode_KIND(uni);
10335 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010336 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10338 if (chr != str[i])
10339 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010340 /* This check keeps Python strings that end in '\0' from comparing equal
10341 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010343 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010344 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010345 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010346 return 0;
10347}
10348
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010349
Benjamin Peterson29060642009-01-31 22:14:21 +000010350#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010351 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010352
Alexander Belopolsky40018472011-02-26 01:02:56 +000010353PyObject *
10354PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010355{
10356 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010358 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10359 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (PyUnicode_READY(left) == -1 ||
10361 PyUnicode_READY(right) == -1)
10362 return NULL;
10363 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10364 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010365 if (op == Py_EQ) {
10366 Py_INCREF(Py_False);
10367 return Py_False;
10368 }
10369 if (op == Py_NE) {
10370 Py_INCREF(Py_True);
10371 return Py_True;
10372 }
10373 }
Victor Stinner90db9c42012-10-04 21:53:50 +020010374 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010375
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010376 /* Convert the return value to a Boolean */
10377 switch (op) {
10378 case Py_EQ:
10379 v = TEST_COND(result == 0);
10380 break;
10381 case Py_NE:
10382 v = TEST_COND(result != 0);
10383 break;
10384 case Py_LE:
10385 v = TEST_COND(result <= 0);
10386 break;
10387 case Py_GE:
10388 v = TEST_COND(result >= 0);
10389 break;
10390 case Py_LT:
10391 v = TEST_COND(result == -1);
10392 break;
10393 case Py_GT:
10394 v = TEST_COND(result == 1);
10395 break;
10396 default:
10397 PyErr_BadArgument();
10398 return NULL;
10399 }
10400 Py_INCREF(v);
10401 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010402 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010403
Brian Curtindfc80e32011-08-10 20:28:54 -050010404 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010405}
10406
Alexander Belopolsky40018472011-02-26 01:02:56 +000010407int
10408PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010409{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010410 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 int kind1, kind2, kind;
10412 void *buf1, *buf2;
10413 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010414 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010415
10416 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010417 sub = PyUnicode_FromObject(element);
10418 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010419 PyErr_Format(PyExc_TypeError,
10420 "'in <string>' requires string as left operand, not %s",
10421 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010422 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010423 }
10424
Thomas Wouters477c8d52006-05-27 19:21:47 +000010425 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010426 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010427 Py_DECREF(sub);
10428 return -1;
10429 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010430 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10431 Py_DECREF(sub);
10432 Py_DECREF(str);
10433 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 kind1 = PyUnicode_KIND(str);
10436 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010437 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 buf1 = PyUnicode_DATA(str);
10439 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010440 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010441 if (kind2 > kind) {
10442 Py_DECREF(sub);
10443 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010444 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010445 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010446 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (!buf2) {
10449 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010450 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 return -1;
10452 }
10453 len1 = PyUnicode_GET_LENGTH(str);
10454 len2 = PyUnicode_GET_LENGTH(sub);
10455
Benjamin Petersonead6b532011-12-20 17:23:42 -060010456 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 case PyUnicode_1BYTE_KIND:
10458 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10459 break;
10460 case PyUnicode_2BYTE_KIND:
10461 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10462 break;
10463 case PyUnicode_4BYTE_KIND:
10464 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10465 break;
10466 default:
10467 result = -1;
10468 assert(0);
10469 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010470
10471 Py_DECREF(str);
10472 Py_DECREF(sub);
10473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 if (kind2 != kind)
10475 PyMem_Free(buf2);
10476
Guido van Rossum403d68b2000-03-13 15:55:09 +000010477 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010478}
10479
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480/* Concat to string or Unicode object giving a new Unicode object. */
10481
Alexander Belopolsky40018472011-02-26 01:02:56 +000010482PyObject *
10483PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010486 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010487 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488
10489 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010492 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010495 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
10497 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010498 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010502 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010503 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505 }
10506
Victor Stinner488fa492011-12-12 00:01:39 +010010507 u_len = PyUnicode_GET_LENGTH(u);
10508 v_len = PyUnicode_GET_LENGTH(v);
10509 if (u_len > PY_SSIZE_T_MAX - v_len) {
10510 PyErr_SetString(PyExc_OverflowError,
10511 "strings are too large to concat");
10512 goto onError;
10513 }
10514 new_len = u_len + v_len;
10515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010517 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010518 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519
Guido van Rossumd57fd912000-03-10 22:53:23 +000010520 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010521 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010523 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010524 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10525 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010526 Py_DECREF(u);
10527 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010528 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010529 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530
Benjamin Peterson29060642009-01-31 22:14:21 +000010531 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532 Py_XDECREF(u);
10533 Py_XDECREF(v);
10534 return NULL;
10535}
10536
Walter Dörwald1ab83302007-05-18 17:15:44 +000010537void
Victor Stinner23e56682011-10-03 03:54:37 +020010538PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010539{
Victor Stinner23e56682011-10-03 03:54:37 +020010540 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010541 Py_UCS4 maxchar, maxchar2;
10542 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010543
10544 if (p_left == NULL) {
10545 if (!PyErr_Occurred())
10546 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010547 return;
10548 }
Victor Stinner23e56682011-10-03 03:54:37 +020010549 left = *p_left;
10550 if (right == NULL || !PyUnicode_Check(left)) {
10551 if (!PyErr_Occurred())
10552 PyErr_BadInternalCall();
10553 goto error;
10554 }
10555
Benjamin Petersonbac79492012-01-14 13:34:47 -050010556 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010557 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010558 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010559 goto error;
10560
Victor Stinner488fa492011-12-12 00:01:39 +010010561 /* Shortcuts */
10562 if (left == unicode_empty) {
10563 Py_DECREF(left);
10564 Py_INCREF(right);
10565 *p_left = right;
10566 return;
10567 }
10568 if (right == unicode_empty)
10569 return;
10570
10571 left_len = PyUnicode_GET_LENGTH(left);
10572 right_len = PyUnicode_GET_LENGTH(right);
10573 if (left_len > PY_SSIZE_T_MAX - right_len) {
10574 PyErr_SetString(PyExc_OverflowError,
10575 "strings are too large to concat");
10576 goto error;
10577 }
10578 new_len = left_len + right_len;
10579
10580 if (unicode_modifiable(left)
10581 && PyUnicode_CheckExact(right)
10582 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010583 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10584 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010585 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010586 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010587 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10588 {
10589 /* append inplace */
10590 if (unicode_resize(p_left, new_len) != 0) {
10591 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10592 * deallocated so it cannot be put back into
10593 * 'variable'. The MemoryError is raised when there
10594 * is no value in 'variable', which might (very
10595 * remotely) be a cause of incompatibilities.
10596 */
10597 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010598 }
Victor Stinner488fa492011-12-12 00:01:39 +010010599 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010600 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010601 }
Victor Stinner488fa492011-12-12 00:01:39 +010010602 else {
10603 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10604 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010605 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010606
Victor Stinner488fa492011-12-12 00:01:39 +010010607 /* Concat the two Unicode strings */
10608 res = PyUnicode_New(new_len, maxchar);
10609 if (res == NULL)
10610 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010611 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10612 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010613 Py_DECREF(left);
10614 *p_left = res;
10615 }
10616 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010617 return;
10618
10619error:
Victor Stinner488fa492011-12-12 00:01:39 +010010620 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010621}
10622
10623void
10624PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10625{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010626 PyUnicode_Append(pleft, right);
10627 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010628}
10629
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010630PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010631 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010633Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010634string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010635interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636
10637static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010638unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010640 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010641 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010642 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 int kind1, kind2, kind;
10645 void *buf1, *buf2;
10646 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647
Jesus Ceaac451502011-04-20 17:09:23 +020010648 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10649 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010651
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 kind1 = PyUnicode_KIND(self);
10653 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010654 if (kind2 > kind1)
10655 return PyLong_FromLong(0);
10656 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 buf1 = PyUnicode_DATA(self);
10658 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010660 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 if (!buf2) {
10662 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 return NULL;
10664 }
10665 len1 = PyUnicode_GET_LENGTH(self);
10666 len2 = PyUnicode_GET_LENGTH(substring);
10667
10668 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010669 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 case PyUnicode_1BYTE_KIND:
10671 iresult = ucs1lib_count(
10672 ((Py_UCS1*)buf1) + start, end - start,
10673 buf2, len2, PY_SSIZE_T_MAX
10674 );
10675 break;
10676 case PyUnicode_2BYTE_KIND:
10677 iresult = ucs2lib_count(
10678 ((Py_UCS2*)buf1) + start, end - start,
10679 buf2, len2, PY_SSIZE_T_MAX
10680 );
10681 break;
10682 case PyUnicode_4BYTE_KIND:
10683 iresult = ucs4lib_count(
10684 ((Py_UCS4*)buf1) + start, end - start,
10685 buf2, len2, PY_SSIZE_T_MAX
10686 );
10687 break;
10688 default:
10689 assert(0); iresult = 0;
10690 }
10691
10692 result = PyLong_FromSsize_t(iresult);
10693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (kind2 != kind)
10695 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
10697 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699 return result;
10700}
10701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010702PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010703 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010705Encode S using the codec registered for encoding. Default encoding\n\
10706is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010707handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010708a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10709'xmlcharrefreplace' as well as any other name registered with\n\
10710codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711
10712static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010713unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010715 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 char *encoding = NULL;
10717 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010718
Benjamin Peterson308d6372009-09-18 21:42:35 +000010719 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10720 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010722 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010723}
10724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727\n\
10728Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010729If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
10731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010732unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010734 Py_ssize_t i, j, line_pos, src_len, incr;
10735 Py_UCS4 ch;
10736 PyObject *u;
10737 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010739 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010740 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
10742 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744
Antoine Pitrou22425222011-10-04 19:10:51 +020010745 if (PyUnicode_READY(self) == -1)
10746 return NULL;
10747
Thomas Wouters7e474022000-07-16 12:04:32 +000010748 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010749 src_len = PyUnicode_GET_LENGTH(self);
10750 i = j = line_pos = 0;
10751 kind = PyUnicode_KIND(self);
10752 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010753 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010754 for (; i < src_len; i++) {
10755 ch = PyUnicode_READ(kind, src_data, i);
10756 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010757 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010758 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010759 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010760 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010761 goto overflow;
10762 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010764 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010765 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010767 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010768 goto overflow;
10769 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010771 if (ch == '\n' || ch == '\r')
10772 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010774 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010775 if (!found)
10776 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010777
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010779 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780 if (!u)
10781 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010782 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783
Antoine Pitroue71d5742011-10-04 15:55:09 +020010784 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
Antoine Pitroue71d5742011-10-04 15:55:09 +020010786 for (; i < src_len; i++) {
10787 ch = PyUnicode_READ(kind, src_data, i);
10788 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010790 incr = tabsize - (line_pos % tabsize);
10791 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010792 FILL(kind, dest_data, ' ', j, incr);
10793 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010795 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010796 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010797 line_pos++;
10798 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010799 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010800 if (ch == '\n' || ch == '\r')
10801 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010802 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010803 }
10804 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010805 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010806
Antoine Pitroue71d5742011-10-04 15:55:09 +020010807 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010808 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010810}
10811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010812PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010813 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814\n\
10815Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010816such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817arguments start and end are interpreted as in slice notation.\n\
10818\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010819Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
10821static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010822unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010824 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010825 Py_ssize_t start;
10826 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010827 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
Jesus Ceaac451502011-04-20 17:09:23 +020010829 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10830 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 if (PyUnicode_READY(self) == -1)
10834 return NULL;
10835 if (PyUnicode_READY(substring) == -1)
10836 return NULL;
10837
Victor Stinner7931d9a2011-11-04 00:22:48 +010010838 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839
10840 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (result == -2)
10843 return NULL;
10844
Christian Heimes217cfd12007-12-02 14:31:20 +000010845 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846}
10847
10848static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010849unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010851 void *data;
10852 enum PyUnicode_Kind kind;
10853 Py_UCS4 ch;
10854 PyObject *res;
10855
10856 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10857 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010859 }
10860 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10861 PyErr_SetString(PyExc_IndexError, "string index out of range");
10862 return NULL;
10863 }
10864 kind = PyUnicode_KIND(self);
10865 data = PyUnicode_DATA(self);
10866 ch = PyUnicode_READ(kind, data, index);
10867 if (ch < 256)
10868 return get_latin1_char(ch);
10869
10870 res = PyUnicode_New(1, ch);
10871 if (res == NULL)
10872 return NULL;
10873 kind = PyUnicode_KIND(res);
10874 data = PyUnicode_DATA(res);
10875 PyUnicode_WRITE(kind, data, 0, ch);
10876 assert(_PyUnicode_CheckConsistency(res, 1));
10877 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010878}
10879
Guido van Rossumc2504932007-09-18 19:42:40 +000010880/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010881 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010882static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010883unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010884{
Guido van Rossumc2504932007-09-18 19:42:40 +000010885 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010886 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010887
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010888#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010889 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010890#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (_PyUnicode_HASH(self) != -1)
10892 return _PyUnicode_HASH(self);
10893 if (PyUnicode_READY(self) == -1)
10894 return -1;
10895 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010896 /*
10897 We make the hash of the empty string be 0, rather than using
10898 (prefix ^ suffix), since this slightly obfuscates the hash secret
10899 */
10900 if (len == 0) {
10901 _PyUnicode_HASH(self) = 0;
10902 return 0;
10903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904
10905 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010906#define HASH(P) \
10907 x ^= (Py_uhash_t) *P << 7; \
10908 while (--len >= 0) \
10909 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910
Georg Brandl2fb477c2012-02-21 00:33:36 +010010911 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 switch (PyUnicode_KIND(self)) {
10913 case PyUnicode_1BYTE_KIND: {
10914 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10915 HASH(c);
10916 break;
10917 }
10918 case PyUnicode_2BYTE_KIND: {
10919 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10920 HASH(s);
10921 break;
10922 }
10923 default: {
10924 Py_UCS4 *l;
10925 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10926 "Impossible switch case in unicode_hash");
10927 l = PyUnicode_4BYTE_DATA(self);
10928 HASH(l);
10929 break;
10930 }
10931 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010932 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10933 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934
Guido van Rossumc2504932007-09-18 19:42:40 +000010935 if (x == -1)
10936 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010937 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010938 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010942PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010945Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946
10947static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010950 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010951 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010952 Py_ssize_t start;
10953 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954
Jesus Ceaac451502011-04-20 17:09:23 +020010955 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10956 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 if (PyUnicode_READY(self) == -1)
10960 return NULL;
10961 if (PyUnicode_READY(substring) == -1)
10962 return NULL;
10963
Victor Stinner7931d9a2011-11-04 00:22:48 +010010964 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965
10966 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010967
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (result == -2)
10969 return NULL;
10970
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 if (result < 0) {
10972 PyErr_SetString(PyExc_ValueError, "substring not found");
10973 return NULL;
10974 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010975
Christian Heimes217cfd12007-12-02 14:31:20 +000010976 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977}
10978
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010979PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010980 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010982Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010983at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984
10985static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010986unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 Py_ssize_t i, length;
10989 int kind;
10990 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991 int cased;
10992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010993 if (PyUnicode_READY(self) == -1)
10994 return NULL;
10995 length = PyUnicode_GET_LENGTH(self);
10996 kind = PyUnicode_KIND(self);
10997 data = PyUnicode_DATA(self);
10998
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 if (length == 1)
11001 return PyBool_FromLong(
11002 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011004 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011006 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011007
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 for (i = 0; i < length; i++) {
11010 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011011
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11013 return PyBool_FromLong(0);
11014 else if (!cased && Py_UNICODE_ISLOWER(ch))
11015 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011017 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018}
11019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011020PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011021 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011023Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011024at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025
11026static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011027unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011028{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 Py_ssize_t i, length;
11030 int kind;
11031 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 int cased;
11033
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 if (PyUnicode_READY(self) == -1)
11035 return NULL;
11036 length = PyUnicode_GET_LENGTH(self);
11037 kind = PyUnicode_KIND(self);
11038 data = PyUnicode_DATA(self);
11039
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (length == 1)
11042 return PyBool_FromLong(
11043 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011044
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011045 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011047 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011048
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 for (i = 0; i < length; i++) {
11051 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011052
Benjamin Peterson29060642009-01-31 22:14:21 +000011053 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11054 return PyBool_FromLong(0);
11055 else if (!cased && Py_UNICODE_ISUPPER(ch))
11056 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011058 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059}
11060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011064Return True if S is a titlecased string and there is at least one\n\
11065character in S, i.e. upper- and titlecase characters may only\n\
11066follow uncased characters and lowercase characters only cased ones.\n\
11067Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068
11069static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011070unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011072 Py_ssize_t i, length;
11073 int kind;
11074 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075 int cased, previous_is_cased;
11076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (PyUnicode_READY(self) == -1)
11078 return NULL;
11079 length = PyUnicode_GET_LENGTH(self);
11080 kind = PyUnicode_KIND(self);
11081 data = PyUnicode_DATA(self);
11082
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 if (length == 1) {
11085 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11086 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11087 (Py_UNICODE_ISUPPER(ch) != 0));
11088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011089
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011090 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011092 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011093
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 cased = 0;
11095 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 for (i = 0; i < length; i++) {
11097 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011098
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11100 if (previous_is_cased)
11101 return PyBool_FromLong(0);
11102 previous_is_cased = 1;
11103 cased = 1;
11104 }
11105 else if (Py_UNICODE_ISLOWER(ch)) {
11106 if (!previous_is_cased)
11107 return PyBool_FromLong(0);
11108 previous_is_cased = 1;
11109 cased = 1;
11110 }
11111 else
11112 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011114 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011115}
11116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011117PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011118 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011119\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011120Return True if all characters in S are whitespace\n\
11121and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
11123static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011124unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011126 Py_ssize_t i, length;
11127 int kind;
11128 void *data;
11129
11130 if (PyUnicode_READY(self) == -1)
11131 return NULL;
11132 length = PyUnicode_GET_LENGTH(self);
11133 kind = PyUnicode_KIND(self);
11134 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011137 if (length == 1)
11138 return PyBool_FromLong(
11139 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011141 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011144
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 for (i = 0; i < length; i++) {
11146 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011147 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151}
11152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011153PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011155\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011156Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011157and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011158
11159static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011160unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 Py_ssize_t i, length;
11163 int kind;
11164 void *data;
11165
11166 if (PyUnicode_READY(self) == -1)
11167 return NULL;
11168 length = PyUnicode_GET_LENGTH(self);
11169 kind = PyUnicode_KIND(self);
11170 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011171
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011172 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 if (length == 1)
11174 return PyBool_FromLong(
11175 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011176
11177 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011181 for (i = 0; i < length; i++) {
11182 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011183 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011184 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011185 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011186}
11187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011188PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011189 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011190\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011191Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011192and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011193
11194static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011195unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 int kind;
11198 void *data;
11199 Py_ssize_t len, i;
11200
11201 if (PyUnicode_READY(self) == -1)
11202 return NULL;
11203
11204 kind = PyUnicode_KIND(self);
11205 data = PyUnicode_DATA(self);
11206 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011208 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 if (len == 1) {
11210 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11211 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11212 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011213
11214 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011218 for (i = 0; i < len; i++) {
11219 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011220 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011222 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011223 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011229Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011230False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
11232static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011233unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 Py_ssize_t i, length;
11236 int kind;
11237 void *data;
11238
11239 if (PyUnicode_READY(self) == -1)
11240 return NULL;
11241 length = PyUnicode_GET_LENGTH(self);
11242 kind = PyUnicode_KIND(self);
11243 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 if (length == 1)
11247 return PyBool_FromLong(
11248 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011249
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011250 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254 for (i = 0; i < length; i++) {
11255 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011258 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259}
11260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011264Return True if all characters in S are digits\n\
11265and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266
11267static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011268unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 Py_ssize_t i, length;
11271 int kind;
11272 void *data;
11273
11274 if (PyUnicode_READY(self) == -1)
11275 return NULL;
11276 length = PyUnicode_GET_LENGTH(self);
11277 kind = PyUnicode_KIND(self);
11278 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 if (length == 1) {
11282 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11283 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11284 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011286 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011288 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 for (i = 0; i < length; i++) {
11291 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011294 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295}
11296
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011297PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011300Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011301False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
11303static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011304unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 Py_ssize_t i, length;
11307 int kind;
11308 void *data;
11309
11310 if (PyUnicode_READY(self) == -1)
11311 return NULL;
11312 length = PyUnicode_GET_LENGTH(self);
11313 kind = PyUnicode_KIND(self);
11314 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 if (length == 1)
11318 return PyBool_FromLong(
11319 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011321 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011323 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 for (i = 0; i < length; i++) {
11326 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011327 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011329 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330}
11331
Martin v. Löwis47383402007-08-15 07:32:56 +000011332int
11333PyUnicode_IsIdentifier(PyObject *self)
11334{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 int kind;
11336 void *data;
11337 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011338 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (PyUnicode_READY(self) == -1) {
11341 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011342 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 }
11344
11345 /* Special case for empty strings */
11346 if (PyUnicode_GET_LENGTH(self) == 0)
11347 return 0;
11348 kind = PyUnicode_KIND(self);
11349 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011350
11351 /* PEP 3131 says that the first character must be in
11352 XID_Start and subsequent characters in XID_Continue,
11353 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011354 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011355 letters, digits, underscore). However, given the current
11356 definition of XID_Start and XID_Continue, it is sufficient
11357 to check just for these, except that _ must be allowed
11358 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011360 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011361 return 0;
11362
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011363 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011366 return 1;
11367}
11368
11369PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011370 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011371\n\
11372Return True if S is a valid identifier according\n\
11373to the language definition.");
11374
11375static PyObject*
11376unicode_isidentifier(PyObject *self)
11377{
11378 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11379}
11380
Georg Brandl559e5d72008-06-11 18:37:52 +000011381PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011382 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011383\n\
11384Return True if all characters in S are considered\n\
11385printable in repr() or S is empty, False otherwise.");
11386
11387static PyObject*
11388unicode_isprintable(PyObject *self)
11389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 Py_ssize_t i, length;
11391 int kind;
11392 void *data;
11393
11394 if (PyUnicode_READY(self) == -1)
11395 return NULL;
11396 length = PyUnicode_GET_LENGTH(self);
11397 kind = PyUnicode_KIND(self);
11398 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011399
11400 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 if (length == 1)
11402 return PyBool_FromLong(
11403 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011405 for (i = 0; i < length; i++) {
11406 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011407 Py_RETURN_FALSE;
11408 }
11409 }
11410 Py_RETURN_TRUE;
11411}
11412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011414 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011415\n\
11416Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011417iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
11419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011420unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011422 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423}
11424
Martin v. Löwis18e16552006-02-15 17:27:45 +000011425static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (PyUnicode_READY(self) == -1)
11429 return -1;
11430 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431}
11432
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011433PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011436Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011437done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011440unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011442 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 Py_UCS4 fillchar = ' ';
11444
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011445 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446 return NULL;
11447
Benjamin Petersonbac79492012-01-14 13:34:47 -050011448 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450
Victor Stinnerc4b49542011-12-11 22:44:26 +010011451 if (PyUnicode_GET_LENGTH(self) >= width)
11452 return unicode_result_unchanged(self);
11453
11454 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455}
11456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011457PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011458 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011460Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011463unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011465 if (PyUnicode_READY(self) == -1)
11466 return NULL;
11467 if (PyUnicode_IS_ASCII(self))
11468 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011469 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470}
11471
/* Strip-direction selectors.  Their values double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the matching format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11480
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011481/* externally visible for str.strip(unicode) */
11482PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011483_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 void *data;
11486 int kind;
11487 Py_ssize_t i, j, len;
11488 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11491 return NULL;
11492
11493 kind = PyUnicode_KIND(self);
11494 data = PyUnicode_DATA(self);
11495 len = PyUnicode_GET_LENGTH(self);
11496 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11497 PyUnicode_DATA(sepobj),
11498 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011499
Benjamin Peterson14339b62009-01-31 16:36:08 +000011500 i = 0;
11501 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 while (i < len &&
11503 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 i++;
11505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011506 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011507
Benjamin Peterson14339b62009-01-31 16:36:08 +000011508 j = len;
11509 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 do {
11511 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 } while (j >= i &&
11513 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011514 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011515 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011516
Victor Stinner7931d9a2011-11-04 00:22:48 +010011517 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518}
11519
11520PyObject*
11521PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11522{
11523 unsigned char *data;
11524 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011525 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526
Victor Stinnerde636f32011-10-01 03:55:54 +020011527 if (PyUnicode_READY(self) == -1)
11528 return NULL;
11529
Victor Stinner684d5fd2012-05-03 02:32:34 +020011530 length = PyUnicode_GET_LENGTH(self);
11531 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011532
Victor Stinner684d5fd2012-05-03 02:32:34 +020011533 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011534 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535
Victor Stinnerde636f32011-10-01 03:55:54 +020011536 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011537 PyErr_SetString(PyExc_IndexError, "string index out of range");
11538 return NULL;
11539 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011540 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011541 Py_INCREF(unicode_empty);
11542 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011543 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011544
Victor Stinner684d5fd2012-05-03 02:32:34 +020011545 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011546 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011547 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011548 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011549 }
11550 else {
11551 kind = PyUnicode_KIND(self);
11552 data = PyUnicode_1BYTE_DATA(self);
11553 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011554 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011555 length);
11556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558
11559static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011560do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 int kind;
11563 void *data;
11564 Py_ssize_t len, i, j;
11565
11566 if (PyUnicode_READY(self) == -1)
11567 return NULL;
11568
11569 kind = PyUnicode_KIND(self);
11570 data = PyUnicode_DATA(self);
11571 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011572
Benjamin Peterson14339b62009-01-31 16:36:08 +000011573 i = 0;
11574 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011576 i++;
11577 }
11578 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011579
Benjamin Peterson14339b62009-01-31 16:36:08 +000011580 j = len;
11581 if (striptype != LEFTSTRIP) {
11582 do {
11583 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011585 j++;
11586 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011587
Victor Stinner7931d9a2011-11-04 00:22:48 +010011588 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011591
11592static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011593do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011594{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011595 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011596
Benjamin Peterson14339b62009-01-31 16:36:08 +000011597 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11598 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011599
Benjamin Peterson14339b62009-01-31 16:36:08 +000011600 if (sep != NULL && sep != Py_None) {
11601 if (PyUnicode_Check(sep))
11602 return _PyUnicode_XStrip(self, striptype, sep);
11603 else {
11604 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "%s arg must be None or str",
11606 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011607 return NULL;
11608 }
11609 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011610
Benjamin Peterson14339b62009-01-31 16:36:08 +000011611 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612}
11613
11614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011615PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617\n\
11618Return a copy of the string S with leading and trailing\n\
11619whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011620If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011621
11622static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011623unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011624{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011625 if (PyTuple_GET_SIZE(args) == 0)
11626 return do_strip(self, BOTHSTRIP); /* Common case */
11627 else
11628 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011629}
11630
11631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011632PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011634\n\
11635Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011636If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011637
11638static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011639unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011640{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011641 if (PyTuple_GET_SIZE(args) == 0)
11642 return do_strip(self, LEFTSTRIP); /* Common case */
11643 else
11644 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011645}
11646
11647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011650\n\
11651Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011652If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011653
11654static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011655unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011656{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011657 if (PyTuple_GET_SIZE(args) == 0)
11658 return do_strip(self, RIGHTSTRIP); /* Common case */
11659 else
11660 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011661}
11662
11663
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011667 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669
Georg Brandl222de0f2009-04-12 12:01:50 +000011670 if (len < 1) {
11671 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011672 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011673 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
Victor Stinnerc4b49542011-12-11 22:44:26 +010011675 /* no repeat, return original string */
11676 if (len == 1)
11677 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011678
Benjamin Petersonbac79492012-01-14 13:34:47 -050011679 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 return NULL;
11681
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011682 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011683 PyErr_SetString(PyExc_OverflowError,
11684 "repeated string is too long");
11685 return NULL;
11686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011688
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011689 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 if (!u)
11691 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011692 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 if (PyUnicode_GET_LENGTH(str) == 1) {
11695 const int kind = PyUnicode_KIND(str);
11696 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011697 if (kind == PyUnicode_1BYTE_KIND) {
11698 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011699 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011700 }
11701 else if (kind == PyUnicode_2BYTE_KIND) {
11702 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011703 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011704 ucs2[n] = fill_char;
11705 } else {
11706 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11707 assert(kind == PyUnicode_4BYTE_KIND);
11708 for (n = 0; n < len; ++n)
11709 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 }
11712 else {
11713 /* number of characters copied this far */
11714 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011715 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011716 char *to = (char *) PyUnicode_DATA(u);
11717 Py_MEMCPY(to, PyUnicode_DATA(str),
11718 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 n = (done <= nchars-done) ? done : nchars-done;
11721 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011722 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724 }
11725
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011726 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011727 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
Alexander Belopolsky40018472011-02-26 01:02:56 +000011730PyObject *
11731PyUnicode_Replace(PyObject *obj,
11732 PyObject *subobj,
11733 PyObject *replobj,
11734 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735{
11736 PyObject *self;
11737 PyObject *str1;
11738 PyObject *str2;
11739 PyObject *result;
11740
11741 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011742 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011745 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 Py_DECREF(self);
11747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 }
11749 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011750 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011751 Py_DECREF(self);
11752 Py_DECREF(str1);
11753 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011755 if (PyUnicode_READY(self) == -1 ||
11756 PyUnicode_READY(str1) == -1 ||
11757 PyUnicode_READY(str2) == -1)
11758 result = NULL;
11759 else
11760 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 Py_DECREF(self);
11762 Py_DECREF(str1);
11763 Py_DECREF(str2);
11764 return result;
11765}
11766
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011767PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011768 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769\n\
11770Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011771old replaced by new. If the optional argument count is\n\
11772given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
11774static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 PyObject *str1;
11778 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011779 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 PyObject *result;
11781
Martin v. Löwis18e16552006-02-15 17:27:45 +000011782 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011784 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011787 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 return NULL;
11789 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011790 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 Py_DECREF(str1);
11792 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011793 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011794 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11795 result = NULL;
11796 else
11797 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798
11799 Py_DECREF(str1);
11800 Py_DECREF(str2);
11801 return result;
11802}
11803
Alexander Belopolsky40018472011-02-26 01:02:56 +000011804static PyObject *
11805unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011807 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 Py_ssize_t isize;
11809 Py_ssize_t osize, squote, dquote, i, o;
11810 Py_UCS4 max, quote;
11811 int ikind, okind;
11812 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011815 return NULL;
11816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 isize = PyUnicode_GET_LENGTH(unicode);
11818 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 /* Compute length of output, quote characters, and
11821 maximum character */
11822 osize = 2; /* quotes */
11823 max = 127;
11824 squote = dquote = 0;
11825 ikind = PyUnicode_KIND(unicode);
11826 for (i = 0; i < isize; i++) {
11827 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11828 switch (ch) {
11829 case '\'': squote++; osize++; break;
11830 case '"': dquote++; osize++; break;
11831 case '\\': case '\t': case '\r': case '\n':
11832 osize += 2; break;
11833 default:
11834 /* Fast-path ASCII */
11835 if (ch < ' ' || ch == 0x7f)
11836 osize += 4; /* \xHH */
11837 else if (ch < 0x7f)
11838 osize++;
11839 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11840 osize++;
11841 max = ch > max ? ch : max;
11842 }
11843 else if (ch < 0x100)
11844 osize += 4; /* \xHH */
11845 else if (ch < 0x10000)
11846 osize += 6; /* \uHHHH */
11847 else
11848 osize += 10; /* \uHHHHHHHH */
11849 }
11850 }
11851
11852 quote = '\'';
11853 if (squote) {
11854 if (dquote)
11855 /* Both squote and dquote present. Use squote,
11856 and escape them */
11857 osize += squote;
11858 else
11859 quote = '"';
11860 }
11861
11862 repr = PyUnicode_New(osize, max);
11863 if (repr == NULL)
11864 return NULL;
11865 okind = PyUnicode_KIND(repr);
11866 odata = PyUnicode_DATA(repr);
11867
11868 PyUnicode_WRITE(okind, odata, 0, quote);
11869 PyUnicode_WRITE(okind, odata, osize-1, quote);
11870
11871 for (i = 0, o = 1; i < isize; i++) {
11872 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011873
11874 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if ((ch == quote) || (ch == '\\')) {
11876 PyUnicode_WRITE(okind, odata, o++, '\\');
11877 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011878 continue;
11879 }
11880
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011882 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 PyUnicode_WRITE(okind, odata, o++, '\\');
11884 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011885 }
11886 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 PyUnicode_WRITE(okind, odata, o++, '\\');
11888 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011889 }
11890 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 PyUnicode_WRITE(okind, odata, o++, '\\');
11892 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011893 }
11894
11895 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011896 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 PyUnicode_WRITE(okind, odata, o++, '\\');
11898 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011901 }
11902
Georg Brandl559e5d72008-06-11 18:37:52 +000011903 /* Copy ASCII characters as-is */
11904 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011906 }
11907
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011909 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011910 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011911 (categories Z* and C* except ASCII space)
11912 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011914 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011915 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011918 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11919 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011920 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011921 /* Map 16-bit characters to '\uxxxx' */
11922 else if (ch <= 0xffff) {
11923 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011924 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11925 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11926 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11927 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011928 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011929 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011930 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011931 PyUnicode_WRITE(okind, odata, o++, 'U');
11932 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11933 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11934 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11935 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011936 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11937 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11938 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11939 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011940 }
11941 }
11942 /* Copy characters as-is */
11943 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011945 }
11946 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011949 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011950 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011953PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955\n\
11956Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011957such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958arguments start and end are interpreted as in slice notation.\n\
11959\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011960Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011961
11962static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011965 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011966 Py_ssize_t start;
11967 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011968 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969
Jesus Ceaac451502011-04-20 17:09:23 +020011970 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11971 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (PyUnicode_READY(self) == -1)
11975 return NULL;
11976 if (PyUnicode_READY(substring) == -1)
11977 return NULL;
11978
Victor Stinner7931d9a2011-11-04 00:22:48 +010011979 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980
11981 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (result == -2)
11984 return NULL;
11985
Christian Heimes217cfd12007-12-02 14:31:20 +000011986 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987}
11988
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011989PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011992Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993
11994static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011997 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011998 Py_ssize_t start;
11999 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012000 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001
Jesus Ceaac451502011-04-20 17:09:23 +020012002 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12003 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (PyUnicode_READY(self) == -1)
12007 return NULL;
12008 if (PyUnicode_READY(substring) == -1)
12009 return NULL;
12010
Victor Stinner7931d9a2011-11-04 00:22:48 +010012011 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012
12013 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 if (result == -2)
12016 return NULL;
12017
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018 if (result < 0) {
12019 PyErr_SetString(PyExc_ValueError, "substring not found");
12020 return NULL;
12021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022
Christian Heimes217cfd12007-12-02 14:31:20 +000012023 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024}
12025
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012026PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012029Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012030done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
12032static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012033unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012035 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 Py_UCS4 fillchar = ' ';
12037
Victor Stinnere9a29352011-10-01 02:14:59 +020012038 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012040
Benjamin Petersonbac79492012-01-14 13:34:47 -050012041 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042 return NULL;
12043
Victor Stinnerc4b49542011-12-11 22:44:26 +010012044 if (PyUnicode_GET_LENGTH(self) >= width)
12045 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046
Victor Stinnerc4b49542011-12-11 22:44:26 +010012047 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048}
12049
Alexander Belopolsky40018472011-02-26 01:02:56 +000012050PyObject *
12051PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052{
12053 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012054
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055 s = PyUnicode_FromObject(s);
12056 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012057 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012058 if (sep != NULL) {
12059 sep = PyUnicode_FromObject(sep);
12060 if (sep == NULL) {
12061 Py_DECREF(s);
12062 return NULL;
12063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064 }
12065
Victor Stinner9310abb2011-10-05 00:59:23 +020012066 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
12068 Py_DECREF(s);
12069 Py_XDECREF(sep);
12070 return result;
12071}
12072
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012073PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012074 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075\n\
12076Return a list of the words in S, using sep as the\n\
12077delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012078splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012079whitespace string is a separator and empty strings are\n\
12080removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
12082static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012083unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012085 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012087 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012089 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12090 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091 return NULL;
12092
12093 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012094 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012096 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012098 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099}
12100
Thomas Wouters477c8d52006-05-27 19:21:47 +000012101PyObject *
12102PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12103{
12104 PyObject* str_obj;
12105 PyObject* sep_obj;
12106 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 int kind1, kind2, kind;
12108 void *buf1 = NULL, *buf2 = NULL;
12109 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012110
12111 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012112 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012114 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012115 if (!sep_obj) {
12116 Py_DECREF(str_obj);
12117 return NULL;
12118 }
12119 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12120 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012121 Py_DECREF(str_obj);
12122 return NULL;
12123 }
12124
Victor Stinner14f8f022011-10-05 20:58:25 +020012125 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012127 kind = Py_MAX(kind1, kind2);
12128 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012130 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 if (!buf1)
12132 goto onError;
12133 buf2 = PyUnicode_DATA(sep_obj);
12134 if (kind2 != kind)
12135 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12136 if (!buf2)
12137 goto onError;
12138 len1 = PyUnicode_GET_LENGTH(str_obj);
12139 len2 = PyUnicode_GET_LENGTH(sep_obj);
12140
Benjamin Petersonead6b532011-12-20 17:23:42 -060012141 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012142 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012143 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12144 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12145 else
12146 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 break;
12148 case PyUnicode_2BYTE_KIND:
12149 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12150 break;
12151 case PyUnicode_4BYTE_KIND:
12152 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12153 break;
12154 default:
12155 assert(0);
12156 out = 0;
12157 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012158
12159 Py_DECREF(sep_obj);
12160 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (kind1 != kind)
12162 PyMem_Free(buf1);
12163 if (kind2 != kind)
12164 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012165
12166 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012167 onError:
12168 Py_DECREF(sep_obj);
12169 Py_DECREF(str_obj);
12170 if (kind1 != kind && buf1)
12171 PyMem_Free(buf1);
12172 if (kind2 != kind && buf2)
12173 PyMem_Free(buf2);
12174 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012175}
12176
12177
12178PyObject *
12179PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12180{
12181 PyObject* str_obj;
12182 PyObject* sep_obj;
12183 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 int kind1, kind2, kind;
12185 void *buf1 = NULL, *buf2 = NULL;
12186 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187
12188 str_obj = PyUnicode_FromObject(str_in);
12189 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012190 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012191 sep_obj = PyUnicode_FromObject(sep_in);
12192 if (!sep_obj) {
12193 Py_DECREF(str_obj);
12194 return NULL;
12195 }
12196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 kind1 = PyUnicode_KIND(str_in);
12198 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012199 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 buf1 = PyUnicode_DATA(str_in);
12201 if (kind1 != kind)
12202 buf1 = _PyUnicode_AsKind(str_in, kind);
12203 if (!buf1)
12204 goto onError;
12205 buf2 = PyUnicode_DATA(sep_obj);
12206 if (kind2 != kind)
12207 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12208 if (!buf2)
12209 goto onError;
12210 len1 = PyUnicode_GET_LENGTH(str_obj);
12211 len2 = PyUnicode_GET_LENGTH(sep_obj);
12212
Benjamin Petersonead6b532011-12-20 17:23:42 -060012213 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012215 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12216 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12217 else
12218 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 break;
12220 case PyUnicode_2BYTE_KIND:
12221 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12222 break;
12223 case PyUnicode_4BYTE_KIND:
12224 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12225 break;
12226 default:
12227 assert(0);
12228 out = 0;
12229 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012230
12231 Py_DECREF(sep_obj);
12232 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 if (kind1 != kind)
12234 PyMem_Free(buf1);
12235 if (kind2 != kind)
12236 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012237
12238 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 onError:
12240 Py_DECREF(sep_obj);
12241 Py_DECREF(str_obj);
12242 if (kind1 != kind && buf1)
12243 PyMem_Free(buf1);
12244 if (kind2 != kind && buf2)
12245 PyMem_Free(buf2);
12246 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012247}
12248
12249PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012251\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012252Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012254found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012255
12256static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012257unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012258{
Victor Stinner9310abb2011-10-05 00:59:23 +020012259 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012260}
12261
12262PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012263 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012264\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012265Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012266the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012267separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012268
12269static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012270unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012271{
Victor Stinner9310abb2011-10-05 00:59:23 +020012272 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012273}
12274
Alexander Belopolsky40018472011-02-26 01:02:56 +000012275PyObject *
12276PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012277{
12278 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012279
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012280 s = PyUnicode_FromObject(s);
12281 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012282 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012283 if (sep != NULL) {
12284 sep = PyUnicode_FromObject(sep);
12285 if (sep == NULL) {
12286 Py_DECREF(s);
12287 return NULL;
12288 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012289 }
12290
Victor Stinner9310abb2011-10-05 00:59:23 +020012291 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012292
12293 Py_DECREF(s);
12294 Py_XDECREF(sep);
12295 return result;
12296}
12297
12298PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012299 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012300\n\
12301Return a list of the words in S, using sep as the\n\
12302delimiter string, starting at the end of the string and\n\
12303working to the front. If maxsplit is given, at most maxsplit\n\
12304splits are done. If sep is not specified, any whitespace string\n\
12305is a separator.");
12306
12307static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012308unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012309{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012310 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012311 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012312 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012313
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012314 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12315 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012316 return NULL;
12317
12318 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012319 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012320 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012321 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012322 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012323 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012324}
12325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012326PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012328\n\
12329Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012330Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012331is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332
12333static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012334unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012336 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012337 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012338
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012339 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12340 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341 return NULL;
12342
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012343 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344}
12345
12346static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012347PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012349 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012350}
12351
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012352PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012353 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012354\n\
12355Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012356and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012357
12358static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012359unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012361 if (PyUnicode_READY(self) == -1)
12362 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012363 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364}
12365
Georg Brandlceee0772007-11-27 23:48:05 +000012366PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012367 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012368\n\
12369Return a translation table usable for str.translate().\n\
12370If there is only one argument, it must be a dictionary mapping Unicode\n\
12371ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012372Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012373If there are two arguments, they must be strings of equal length, and\n\
12374in the resulting dictionary, each character in x will be mapped to the\n\
12375character at the same position in y. If there is a third argument, it\n\
12376must be a string, whose characters will be mapped to None in the result.");
12377
12378static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012379unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012380{
12381 PyObject *x, *y = NULL, *z = NULL;
12382 PyObject *new = NULL, *key, *value;
12383 Py_ssize_t i = 0;
12384 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012385
Georg Brandlceee0772007-11-27 23:48:05 +000012386 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12387 return NULL;
12388 new = PyDict_New();
12389 if (!new)
12390 return NULL;
12391 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 int x_kind, y_kind, z_kind;
12393 void *x_data, *y_data, *z_data;
12394
Georg Brandlceee0772007-11-27 23:48:05 +000012395 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012396 if (!PyUnicode_Check(x)) {
12397 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12398 "be a string if there is a second argument");
12399 goto err;
12400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012402 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12403 "arguments must have equal length");
12404 goto err;
12405 }
12406 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 x_kind = PyUnicode_KIND(x);
12408 y_kind = PyUnicode_KIND(y);
12409 x_data = PyUnicode_DATA(x);
12410 y_data = PyUnicode_DATA(y);
12411 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12412 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012413 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012414 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012415 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012416 if (!value) {
12417 Py_DECREF(key);
12418 goto err;
12419 }
Georg Brandlceee0772007-11-27 23:48:05 +000012420 res = PyDict_SetItem(new, key, value);
12421 Py_DECREF(key);
12422 Py_DECREF(value);
12423 if (res < 0)
12424 goto err;
12425 }
12426 /* create entries for deleting chars in z */
12427 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012428 z_kind = PyUnicode_KIND(z);
12429 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012430 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012431 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012432 if (!key)
12433 goto err;
12434 res = PyDict_SetItem(new, key, Py_None);
12435 Py_DECREF(key);
12436 if (res < 0)
12437 goto err;
12438 }
12439 }
12440 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 int kind;
12442 void *data;
12443
Georg Brandlceee0772007-11-27 23:48:05 +000012444 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012445 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012446 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12447 "to maketrans it must be a dict");
12448 goto err;
12449 }
12450 /* copy entries into the new dict, converting string keys to int keys */
12451 while (PyDict_Next(x, &i, &key, &value)) {
12452 if (PyUnicode_Check(key)) {
12453 /* convert string keys to integer keys */
12454 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012455 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012456 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12457 "table must be of length 1");
12458 goto err;
12459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 kind = PyUnicode_KIND(key);
12461 data = PyUnicode_DATA(key);
12462 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012463 if (!newkey)
12464 goto err;
12465 res = PyDict_SetItem(new, newkey, value);
12466 Py_DECREF(newkey);
12467 if (res < 0)
12468 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012469 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012470 /* just keep integer keys */
12471 if (PyDict_SetItem(new, key, value) < 0)
12472 goto err;
12473 } else {
12474 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12475 "be strings or integers");
12476 goto err;
12477 }
12478 }
12479 }
12480 return new;
12481 err:
12482 Py_DECREF(new);
12483 return NULL;
12484}
12485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012486PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488\n\
12489Return a copy of the string S, where all characters have been mapped\n\
12490through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012491Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012492Unmapped characters are left untouched. Characters mapped to None\n\
12493are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012494
12495static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012498 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499}
12500
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012501PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012503\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012504Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505
12506static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012507unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012509 if (PyUnicode_READY(self) == -1)
12510 return NULL;
12511 if (PyUnicode_IS_ASCII(self))
12512 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012513 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012514}
12515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012516PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012517 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012519Pad a numeric string S with zeros on the left, to fill a field\n\
12520of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
12522static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012523unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012525 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012526 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012527 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 int kind;
12529 void *data;
12530 Py_UCS4 chr;
12531
Martin v. Löwis18e16552006-02-15 17:27:45 +000012532 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533 return NULL;
12534
Benjamin Petersonbac79492012-01-14 13:34:47 -050012535 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
Victor Stinnerc4b49542011-12-11 22:44:26 +010012538 if (PyUnicode_GET_LENGTH(self) >= width)
12539 return unicode_result_unchanged(self);
12540
12541 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542
12543 u = pad(self, fill, 0, '0');
12544
Walter Dörwald068325e2002-04-15 13:36:47 +000012545 if (u == NULL)
12546 return NULL;
12547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 kind = PyUnicode_KIND(u);
12549 data = PyUnicode_DATA(u);
12550 chr = PyUnicode_READ(kind, data, fill);
12551
12552 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012554 PyUnicode_WRITE(kind, data, 0, chr);
12555 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 }
12557
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012558 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012559 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012570PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012571 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012573Return True if S starts with the specified prefix, False otherwise.\n\
12574With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012575With optional end, stop comparing S at that position.\n\
12576prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577
12578static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012579unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012582 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012583 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012584 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012585 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012586 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587
Jesus Ceaac451502011-04-20 17:09:23 +020012588 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012589 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012590 if (PyTuple_Check(subobj)) {
12591 Py_ssize_t i;
12592 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012593 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012594 if (substring == NULL)
12595 return NULL;
12596 result = tailmatch(self, substring, start, end, -1);
12597 Py_DECREF(substring);
12598 if (result) {
12599 Py_RETURN_TRUE;
12600 }
12601 }
12602 /* nothing matched */
12603 Py_RETURN_FALSE;
12604 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012605 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012606 if (substring == NULL) {
12607 if (PyErr_ExceptionMatches(PyExc_TypeError))
12608 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12609 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012610 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012611 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012612 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012614 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615}
12616
12617
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012618PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012621Return True if S ends with the specified suffix, False otherwise.\n\
12622With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012623With optional end, stop comparing S at that position.\n\
12624suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625
12626static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012627unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012628 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012630 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012631 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012632 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012633 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012634 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635
Jesus Ceaac451502011-04-20 17:09:23 +020012636 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012638 if (PyTuple_Check(subobj)) {
12639 Py_ssize_t i;
12640 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012641 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012643 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012644 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012645 result = tailmatch(self, substring, start, end, +1);
12646 Py_DECREF(substring);
12647 if (result) {
12648 Py_RETURN_TRUE;
12649 }
12650 }
12651 Py_RETURN_FALSE;
12652 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012653 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012654 if (substring == NULL) {
12655 if (PyErr_ExceptionMatches(PyExc_TypeError))
12656 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12657 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012659 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012660 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012662 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
Victor Stinner202fdca2012-05-07 12:47:02 +020012665Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012666_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012667{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012668 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012669 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12670 writer->data = PyUnicode_DATA(writer->buffer);
12671 writer->kind = PyUnicode_KIND(writer->buffer);
12672}
12673
Victor Stinnerd3f08822012-05-29 12:57:52 +020012674void
12675_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012676{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012677 memset(writer, 0, sizeof(*writer));
12678#ifdef Py_DEBUG
12679 writer->kind = 5; /* invalid kind */
12680#endif
12681 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012682 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012683}
12684
Victor Stinnerd3f08822012-05-29 12:57:52 +020012685int
12686_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12687 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012688{
12689 Py_ssize_t newlen;
12690 PyObject *newbuffer;
12691
Victor Stinnerd3f08822012-05-29 12:57:52 +020012692 assert(length > 0);
12693
Victor Stinner202fdca2012-05-07 12:47:02 +020012694 if (length > PY_SSIZE_T_MAX - writer->pos) {
12695 PyErr_NoMemory();
12696 return -1;
12697 }
12698 newlen = writer->pos + length;
12699
Victor Stinnerd3f08822012-05-29 12:57:52 +020012700 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012701 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012702 /* overallocate 25% to limit the number of resize */
12703 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12704 newlen += newlen / 4;
12705 if (newlen < writer->min_length)
12706 newlen = writer->min_length;
12707 }
12708 writer->buffer = PyUnicode_New(newlen, maxchar);
12709 if (writer->buffer == NULL)
12710 return -1;
12711 _PyUnicodeWriter_Update(writer);
12712 return 0;
12713 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012714
Victor Stinnerd3f08822012-05-29 12:57:52 +020012715 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012716 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012717 /* overallocate 25% to limit the number of resize */
12718 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12719 newlen += newlen / 4;
12720 if (newlen < writer->min_length)
12721 newlen = writer->min_length;
12722 }
12723
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012724 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012725 /* resize + widen */
12726 newbuffer = PyUnicode_New(newlen, maxchar);
12727 if (newbuffer == NULL)
12728 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012729 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12730 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012731 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012732 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012733 }
12734 else {
12735 newbuffer = resize_compact(writer->buffer, newlen);
12736 if (newbuffer == NULL)
12737 return -1;
12738 }
12739 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012740 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012741 }
12742 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012743 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012744 newbuffer = PyUnicode_New(writer->size, maxchar);
12745 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012746 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012747 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12748 writer->buffer, 0, writer->pos);
12749 Py_DECREF(writer->buffer);
12750 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012751 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012752 }
12753 return 0;
12754}
12755
Victor Stinnerd3f08822012-05-29 12:57:52 +020012756int
12757_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12758{
12759 Py_UCS4 maxchar;
12760 Py_ssize_t len;
12761
12762 if (PyUnicode_READY(str) == -1)
12763 return -1;
12764 len = PyUnicode_GET_LENGTH(str);
12765 if (len == 0)
12766 return 0;
12767 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12768 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012769 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012770 Py_INCREF(str);
12771 writer->buffer = str;
12772 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012773 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012774 writer->size = 0;
12775 writer->pos += len;
12776 return 0;
12777 }
12778 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12779 return -1;
12780 }
12781 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12782 str, 0, len);
12783 writer->pos += len;
12784 return 0;
12785}
12786
Victor Stinnere215d962012-10-06 23:03:36 +020012787int
12788_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12789{
12790 Py_UCS4 maxchar;
12791
12792 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12793 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12794 return -1;
12795 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12796 writer->pos += len;
12797 return 0;
12798}
12799
Victor Stinnerd3f08822012-05-29 12:57:52 +020012800PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012801_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012802{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012803 if (writer->pos == 0) {
12804 Py_XDECREF(writer->buffer);
12805 Py_INCREF(unicode_empty);
12806 return unicode_empty;
12807 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012808 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012809 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12810 return writer->buffer;
12811 }
12812 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12813 PyObject *newbuffer;
12814 newbuffer = resize_compact(writer->buffer, writer->pos);
12815 if (newbuffer == NULL) {
12816 Py_DECREF(writer->buffer);
12817 return NULL;
12818 }
12819 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012820 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012821 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012822 return writer->buffer;
12823}
12824
Victor Stinnerd3f08822012-05-29 12:57:52 +020012825void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012826_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012827{
12828 Py_CLEAR(writer->buffer);
12829}
12830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012831#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012832
12833PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012834 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012835\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012836Return a formatted version of S, using substitutions from args and kwargs.\n\
12837The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012838
Eric Smith27bbca62010-11-04 17:06:58 +000012839PyDoc_STRVAR(format_map__doc__,
12840 "S.format_map(mapping) -> str\n\
12841\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012842Return a formatted version of S, using substitutions from mapping.\n\
12843The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012844
Eric Smith4a7d76d2008-05-30 18:10:19 +000012845static PyObject *
12846unicode__format__(PyObject* self, PyObject* args)
12847{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012848 PyObject *format_spec;
12849 _PyUnicodeWriter writer;
12850 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012851
12852 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12853 return NULL;
12854
Victor Stinnerd3f08822012-05-29 12:57:52 +020012855 if (PyUnicode_READY(self) == -1)
12856 return NULL;
12857 _PyUnicodeWriter_Init(&writer, 0);
12858 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12859 self, format_spec, 0,
12860 PyUnicode_GET_LENGTH(format_spec));
12861 if (ret == -1) {
12862 _PyUnicodeWriter_Dealloc(&writer);
12863 return NULL;
12864 }
12865 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012866}
12867
Eric Smith8c663262007-08-25 02:26:07 +000012868PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012870\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012871Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012872
12873static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012874unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 Py_ssize_t size;
12877
12878 /* If it's a compact object, account for base structure +
12879 character data. */
12880 if (PyUnicode_IS_COMPACT_ASCII(v))
12881 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12882 else if (PyUnicode_IS_COMPACT(v))
12883 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012884 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885 else {
12886 /* If it is a two-block object, account for base object, and
12887 for character block if present. */
12888 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012889 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012891 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 }
12893 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012894 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012895 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012897 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012898 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899
12900 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012901}
12902
12903PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012904 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012905
12906static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012907unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012908{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012909 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012910 if (!copy)
12911 return NULL;
12912 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012913}
12914
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012916 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012917 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012918 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
12919 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012920 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12921 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050012922 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012923 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12924 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12925 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12926 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12927 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012929 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12930 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12931 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012932 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012933 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12934 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12935 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012936 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012937 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012938 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012939 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012940 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12941 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12942 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12943 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12944 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12945 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12946 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12947 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12948 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12949 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12950 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12951 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12952 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12953 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012954 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012955 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012956 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012957 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012958 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012959 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012960 {"maketrans", (PyCFunction) unicode_maketrans,
12961 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012962 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012963#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012964 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012965 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012966#endif
12967
Benjamin Peterson14339b62009-01-31 16:36:08 +000012968 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969 {NULL, NULL}
12970};
12971
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012972static PyObject *
12973unicode_mod(PyObject *v, PyObject *w)
12974{
Brian Curtindfc80e32011-08-10 20:28:54 -050012975 if (!PyUnicode_Check(v))
12976 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012978}
12979
12980static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012981 0, /*nb_add*/
12982 0, /*nb_subtract*/
12983 0, /*nb_multiply*/
12984 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012985};
12986
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012988 (lenfunc) unicode_length, /* sq_length */
12989 PyUnicode_Concat, /* sq_concat */
12990 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12991 (ssizeargfunc) unicode_getitem, /* sq_item */
12992 0, /* sq_slice */
12993 0, /* sq_ass_item */
12994 0, /* sq_ass_slice */
12995 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996};
12997
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012998static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012999unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013000{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 if (PyUnicode_READY(self) == -1)
13002 return NULL;
13003
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013004 if (PyIndex_Check(item)) {
13005 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013006 if (i == -1 && PyErr_Occurred())
13007 return NULL;
13008 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013010 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013011 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013012 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013013 PyObject *result;
13014 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013015 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013016 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013019 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013020 return NULL;
13021 }
13022
13023 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013024 Py_INCREF(unicode_empty);
13025 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013027 slicelength == PyUnicode_GET_LENGTH(self)) {
13028 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013029 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013030 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013031 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013032 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013033 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013034 src_kind = PyUnicode_KIND(self);
13035 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013036 if (!PyUnicode_IS_ASCII(self)) {
13037 kind_limit = kind_maxchar_limit(src_kind);
13038 max_char = 0;
13039 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13040 ch = PyUnicode_READ(src_kind, src_data, cur);
13041 if (ch > max_char) {
13042 max_char = ch;
13043 if (max_char >= kind_limit)
13044 break;
13045 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013046 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013047 }
Victor Stinner55c99112011-10-13 01:17:06 +020013048 else
13049 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013050 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013051 if (result == NULL)
13052 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013053 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013054 dest_data = PyUnicode_DATA(result);
13055
13056 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013057 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13058 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013059 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013060 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013061 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013062 } else {
13063 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13064 return NULL;
13065 }
13066}
13067
13068static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 (lenfunc)unicode_length, /* mp_length */
13070 (binaryfunc)unicode_subscript, /* mp_subscript */
13071 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013072};
13073
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074
Guido van Rossumd57fd912000-03-10 22:53:23 +000013075/* Helpers for PyUnicode_Format() */
13076
Victor Stinnera47082312012-10-04 02:19:54 +020013077struct unicode_formatter_t {
13078 PyObject *args;
13079 int args_owned;
13080 Py_ssize_t arglen, argidx;
13081 PyObject *dict;
13082
13083 enum PyUnicode_Kind fmtkind;
13084 Py_ssize_t fmtcnt, fmtpos;
13085 void *fmtdata;
13086 PyObject *fmtstr;
13087
13088 _PyUnicodeWriter writer;
13089};
13090
13091struct unicode_format_arg_t {
13092 Py_UCS4 ch;
13093 int flags;
13094 Py_ssize_t width;
13095 int prec;
13096 int sign;
13097};
13098
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013100unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101{
Victor Stinnera47082312012-10-04 02:19:54 +020013102 Py_ssize_t argidx = ctx->argidx;
13103
13104 if (argidx < ctx->arglen) {
13105 ctx->argidx++;
13106 if (ctx->arglen < 0)
13107 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 else
Victor Stinnera47082312012-10-04 02:19:54 +020013109 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 }
13111 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013112 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113 return NULL;
13114}
13115
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013116/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117
Victor Stinnera47082312012-10-04 02:19:54 +020013118/* Format a float into the writer if the writer is not NULL, or into *p_output
13119 otherwise.
13120
13121 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013122static int
Victor Stinnera47082312012-10-04 02:19:54 +020013123formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13124 PyObject **p_output,
13125 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013127 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013129 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013130 int prec;
13131 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013132
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133 x = PyFloat_AsDouble(v);
13134 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013135 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013136
Victor Stinnera47082312012-10-04 02:19:54 +020013137 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013140
Victor Stinnera47082312012-10-04 02:19:54 +020013141 if (arg->flags & F_ALT)
13142 dtoa_flags = Py_DTSF_ALT;
13143 else
13144 dtoa_flags = 0;
13145 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013146 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013147 return -1;
13148 len = strlen(p);
13149 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013150 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13151 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013152 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013153 }
Victor Stinner184252a2012-06-16 02:57:41 +020013154 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013155 writer->pos += len;
13156 }
13157 else
13158 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013159 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013160 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161}
13162
Victor Stinnerd0880d52012-04-27 23:40:13 +020013163/* formatlong() emulates the format codes d, u, o, x and X, and
13164 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13165 * Python's regular ints.
13166 * Return value: a new PyUnicodeObject*, or NULL if error.
13167 * The output string is of the form
13168 * "-"? ("0x" | "0X")? digit+
13169 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13170 * set in flags. The case of hex digits will be correct,
13171 * There will be at least prec digits, zero-filled on the left if
13172 * necessary to get that many.
13173 * val object to be converted
13174 * flags bitmask of format flags; only F_ALT is looked at
13175 * prec minimum number of digits; 0-fill on left if needed
13176 * type a character in [duoxX]; u acts the same as d
13177 *
13178 * CAUTION: o, x and X conversions on regular ints can never
13179 * produce a '-' sign, but can for Python's unbounded ints.
13180 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013181static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013182formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013183{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013184 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013185 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013186 Py_ssize_t i;
13187 int sign; /* 1 if '-', else 0 */
13188 int len; /* number of characters */
13189 Py_ssize_t llen;
13190 int numdigits; /* len == numnondigits + numdigits */
13191 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013192 int prec = arg->prec;
13193 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013194
Victor Stinnerd0880d52012-04-27 23:40:13 +020013195 /* Avoid exceeding SSIZE_T_MAX */
13196 if (prec > INT_MAX-3) {
13197 PyErr_SetString(PyExc_OverflowError,
13198 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013199 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013200 }
13201
13202 assert(PyLong_Check(val));
13203
13204 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013205 default:
13206 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013207 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013208 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013209 case 'u':
13210 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013211 if (PyBool_Check(val))
13212 result = PyNumber_ToBase(val, 10);
13213 else
13214 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013215 break;
13216 case 'o':
13217 numnondigits = 2;
13218 result = PyNumber_ToBase(val, 8);
13219 break;
13220 case 'x':
13221 case 'X':
13222 numnondigits = 2;
13223 result = PyNumber_ToBase(val, 16);
13224 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013225 }
13226 if (!result)
13227 return NULL;
13228
13229 assert(unicode_modifiable(result));
13230 assert(PyUnicode_IS_READY(result));
13231 assert(PyUnicode_IS_ASCII(result));
13232
13233 /* To modify the string in-place, there can only be one reference. */
13234 if (Py_REFCNT(result) != 1) {
13235 PyErr_BadInternalCall();
13236 return NULL;
13237 }
13238 buf = PyUnicode_DATA(result);
13239 llen = PyUnicode_GET_LENGTH(result);
13240 if (llen > INT_MAX) {
13241 PyErr_SetString(PyExc_ValueError,
13242 "string too large in _PyBytes_FormatLong");
13243 return NULL;
13244 }
13245 len = (int)llen;
13246 sign = buf[0] == '-';
13247 numnondigits += sign;
13248 numdigits = len - numnondigits;
13249 assert(numdigits > 0);
13250
13251 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013252 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013253 (type == 'o' || type == 'x' || type == 'X'))) {
13254 assert(buf[sign] == '0');
13255 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13256 buf[sign+1] == 'o');
13257 numnondigits -= 2;
13258 buf += 2;
13259 len -= 2;
13260 if (sign)
13261 buf[0] = '-';
13262 assert(len == numnondigits + numdigits);
13263 assert(numdigits > 0);
13264 }
13265
13266 /* Fill with leading zeroes to meet minimum width. */
13267 if (prec > numdigits) {
13268 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13269 numnondigits + prec);
13270 char *b1;
13271 if (!r1) {
13272 Py_DECREF(result);
13273 return NULL;
13274 }
13275 b1 = PyBytes_AS_STRING(r1);
13276 for (i = 0; i < numnondigits; ++i)
13277 *b1++ = *buf++;
13278 for (i = 0; i < prec - numdigits; i++)
13279 *b1++ = '0';
13280 for (i = 0; i < numdigits; i++)
13281 *b1++ = *buf++;
13282 *b1 = '\0';
13283 Py_DECREF(result);
13284 result = r1;
13285 buf = PyBytes_AS_STRING(result);
13286 len = numnondigits + prec;
13287 }
13288
13289 /* Fix up case for hex conversions. */
13290 if (type == 'X') {
13291 /* Need to convert all lower case letters to upper case.
13292 and need to convert 0x to 0X (and -0x to -0X). */
13293 for (i = 0; i < len; i++)
13294 if (buf[i] >= 'a' && buf[i] <= 'x')
13295 buf[i] -= 'a'-'A';
13296 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013297 if (!PyUnicode_Check(result)
13298 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013299 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013300 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013301 Py_DECREF(result);
13302 result = unicode;
13303 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013304 else if (len != PyUnicode_GET_LENGTH(result)) {
13305 if (PyUnicode_Resize(&result, len) < 0)
13306 Py_CLEAR(result);
13307 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013308 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013309}
13310
Victor Stinner621ef3d2012-10-02 00:33:47 +020013311/* Format an integer.
13312 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013313 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013314 * -1 and raise an exception on error */
13315static int
Victor Stinnera47082312012-10-04 02:19:54 +020013316mainformatlong(PyObject *v,
13317 struct unicode_format_arg_t *arg,
13318 PyObject **p_output,
13319 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013320{
13321 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013322 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013323
13324 if (!PyNumber_Check(v))
13325 goto wrongtype;
13326
13327 if (!PyLong_Check(v)) {
13328 iobj = PyNumber_Long(v);
13329 if (iobj == NULL) {
13330 if (PyErr_ExceptionMatches(PyExc_TypeError))
13331 goto wrongtype;
13332 return -1;
13333 }
13334 assert(PyLong_Check(iobj));
13335 }
13336 else {
13337 iobj = v;
13338 Py_INCREF(iobj);
13339 }
13340
13341 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013342 && arg->width == -1 && arg->prec == -1
13343 && !(arg->flags & (F_SIGN | F_BLANK))
13344 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013345 {
13346 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013347 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013348 int base;
13349
Victor Stinnera47082312012-10-04 02:19:54 +020013350 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013351 {
13352 default:
13353 assert(0 && "'type' not in [diuoxX]");
13354 case 'd':
13355 case 'i':
13356 case 'u':
13357 base = 10;
13358 break;
13359 case 'o':
13360 base = 8;
13361 break;
13362 case 'x':
13363 case 'X':
13364 base = 16;
13365 break;
13366 }
13367
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013368 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13369 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013370 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013371 }
13372 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013373 return 1;
13374 }
13375
Victor Stinnera47082312012-10-04 02:19:54 +020013376 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013377 Py_DECREF(iobj);
13378 if (res == NULL)
13379 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013380 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013381 return 0;
13382
13383wrongtype:
13384 PyErr_Format(PyExc_TypeError,
13385 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013386 "not %.200s",
13387 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013388 return -1;
13389}
13390
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013391static Py_UCS4
13392formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013393{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013394 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013395 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013396 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013397 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 goto onError;
13400 }
13401 else {
13402 /* Integer input truncated to a character */
13403 long x;
13404 x = PyLong_AsLong(v);
13405 if (x == -1 && PyErr_Occurred())
13406 goto onError;
13407
Victor Stinner8faf8212011-12-08 22:14:11 +010013408 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 PyErr_SetString(PyExc_OverflowError,
13410 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013412 }
13413
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013416
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013418 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421}
13422
Victor Stinnera47082312012-10-04 02:19:54 +020013423/* Parse options of an argument: flags, width, precision.
13424 Handle also "%(name)" syntax.
13425
13426 Return 0 if the argument has been formatted into arg->str.
13427 Return 1 if the argument has been written into ctx->writer,
13428 Raise an exception and return -1 on error. */
13429static int
13430unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13431 struct unicode_format_arg_t *arg)
13432{
13433#define FORMAT_READ(ctx) \
13434 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13435
13436 PyObject *v;
13437
13438 arg->ch = FORMAT_READ(ctx);
13439 if (arg->ch == '(') {
13440 /* Get argument value from a dictionary. Example: "%(name)s". */
13441 Py_ssize_t keystart;
13442 Py_ssize_t keylen;
13443 PyObject *key;
13444 int pcount = 1;
13445
13446 if (ctx->dict == NULL) {
13447 PyErr_SetString(PyExc_TypeError,
13448 "format requires a mapping");
13449 return -1;
13450 }
13451 ++ctx->fmtpos;
13452 --ctx->fmtcnt;
13453 keystart = ctx->fmtpos;
13454 /* Skip over balanced parentheses */
13455 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13456 arg->ch = FORMAT_READ(ctx);
13457 if (arg->ch == ')')
13458 --pcount;
13459 else if (arg->ch == '(')
13460 ++pcount;
13461 ctx->fmtpos++;
13462 }
13463 keylen = ctx->fmtpos - keystart - 1;
13464 if (ctx->fmtcnt < 0 || pcount > 0) {
13465 PyErr_SetString(PyExc_ValueError,
13466 "incomplete format key");
13467 return -1;
13468 }
13469 key = PyUnicode_Substring(ctx->fmtstr,
13470 keystart, keystart + keylen);
13471 if (key == NULL)
13472 return -1;
13473 if (ctx->args_owned) {
13474 Py_DECREF(ctx->args);
13475 ctx->args_owned = 0;
13476 }
13477 ctx->args = PyObject_GetItem(ctx->dict, key);
13478 Py_DECREF(key);
13479 if (ctx->args == NULL)
13480 return -1;
13481 ctx->args_owned = 1;
13482 ctx->arglen = -1;
13483 ctx->argidx = -2;
13484 }
13485
13486 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13487 arg->flags = 0;
13488 while (--ctx->fmtcnt >= 0) {
13489 arg->ch = FORMAT_READ(ctx);
13490 ctx->fmtpos++;
13491 switch (arg->ch) {
13492 case '-': arg->flags |= F_LJUST; continue;
13493 case '+': arg->flags |= F_SIGN; continue;
13494 case ' ': arg->flags |= F_BLANK; continue;
13495 case '#': arg->flags |= F_ALT; continue;
13496 case '0': arg->flags |= F_ZERO; continue;
13497 }
13498 break;
13499 }
13500
13501 /* Parse width. Example: "%10s" => width=10 */
13502 arg->width = -1;
13503 if (arg->ch == '*') {
13504 v = unicode_format_getnextarg(ctx);
13505 if (v == NULL)
13506 return -1;
13507 if (!PyLong_Check(v)) {
13508 PyErr_SetString(PyExc_TypeError,
13509 "* wants int");
13510 return -1;
13511 }
13512 arg->width = PyLong_AsLong(v);
13513 if (arg->width == -1 && PyErr_Occurred())
13514 return -1;
13515 if (arg->width < 0) {
13516 arg->flags |= F_LJUST;
13517 arg->width = -arg->width;
13518 }
13519 if (--ctx->fmtcnt >= 0) {
13520 arg->ch = FORMAT_READ(ctx);
13521 ctx->fmtpos++;
13522 }
13523 }
13524 else if (arg->ch >= '0' && arg->ch <= '9') {
13525 arg->width = arg->ch - '0';
13526 while (--ctx->fmtcnt >= 0) {
13527 arg->ch = FORMAT_READ(ctx);
13528 ctx->fmtpos++;
13529 if (arg->ch < '0' || arg->ch > '9')
13530 break;
13531 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13532 mixing signed and unsigned comparison. Since arg->ch is between
13533 '0' and '9', casting to int is safe. */
13534 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13535 PyErr_SetString(PyExc_ValueError,
13536 "width too big");
13537 return -1;
13538 }
13539 arg->width = arg->width*10 + (arg->ch - '0');
13540 }
13541 }
13542
13543 /* Parse precision. Example: "%.3f" => prec=3 */
13544 arg->prec = -1;
13545 if (arg->ch == '.') {
13546 arg->prec = 0;
13547 if (--ctx->fmtcnt >= 0) {
13548 arg->ch = FORMAT_READ(ctx);
13549 ctx->fmtpos++;
13550 }
13551 if (arg->ch == '*') {
13552 v = unicode_format_getnextarg(ctx);
13553 if (v == NULL)
13554 return -1;
13555 if (!PyLong_Check(v)) {
13556 PyErr_SetString(PyExc_TypeError,
13557 "* wants int");
13558 return -1;
13559 }
13560 arg->prec = PyLong_AsLong(v);
13561 if (arg->prec == -1 && PyErr_Occurred())
13562 return -1;
13563 if (arg->prec < 0)
13564 arg->prec = 0;
13565 if (--ctx->fmtcnt >= 0) {
13566 arg->ch = FORMAT_READ(ctx);
13567 ctx->fmtpos++;
13568 }
13569 }
13570 else if (arg->ch >= '0' && arg->ch <= '9') {
13571 arg->prec = arg->ch - '0';
13572 while (--ctx->fmtcnt >= 0) {
13573 arg->ch = FORMAT_READ(ctx);
13574 ctx->fmtpos++;
13575 if (arg->ch < '0' || arg->ch > '9')
13576 break;
13577 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13578 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013579 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013580 return -1;
13581 }
13582 arg->prec = arg->prec*10 + (arg->ch - '0');
13583 }
13584 }
13585 }
13586
13587 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13588 if (ctx->fmtcnt >= 0) {
13589 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13590 if (--ctx->fmtcnt >= 0) {
13591 arg->ch = FORMAT_READ(ctx);
13592 ctx->fmtpos++;
13593 }
13594 }
13595 }
13596 if (ctx->fmtcnt < 0) {
13597 PyErr_SetString(PyExc_ValueError,
13598 "incomplete format");
13599 return -1;
13600 }
13601 return 0;
13602
13603#undef FORMAT_READ
13604}
13605
13606/* Format one argument. Supported conversion specifiers:
13607
13608 - "s", "r", "a": any type
13609 - "i", "d", "u", "o", "x", "X": int
13610 - "e", "E", "f", "F", "g", "G": float
13611 - "c": int or str (1 character)
13612
13613 Return 0 if the argument has been formatted into *p_str,
13614 1 if the argument has been written into ctx->writer,
13615 -1 on error. */
13616static int
13617unicode_format_arg_format(struct unicode_formatter_t *ctx,
13618 struct unicode_format_arg_t *arg,
13619 PyObject **p_str)
13620{
13621 PyObject *v;
13622 _PyUnicodeWriter *writer = &ctx->writer;
13623
13624 if (ctx->fmtcnt == 0)
13625 ctx->writer.overallocate = 0;
13626
13627 if (arg->ch == '%') {
13628 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13629 return -1;
13630 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13631 writer->pos += 1;
13632 return 1;
13633 }
13634
13635 v = unicode_format_getnextarg(ctx);
13636 if (v == NULL)
13637 return -1;
13638
13639 arg->sign = 0;
13640
13641 switch (arg->ch) {
13642
13643 case 's':
13644 case 'r':
13645 case 'a':
13646 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13647 /* Fast path */
13648 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13649 return -1;
13650 return 1;
13651 }
13652
13653 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13654 *p_str = v;
13655 Py_INCREF(*p_str);
13656 }
13657 else {
13658 if (arg->ch == 's')
13659 *p_str = PyObject_Str(v);
13660 else if (arg->ch == 'r')
13661 *p_str = PyObject_Repr(v);
13662 else
13663 *p_str = PyObject_ASCII(v);
13664 }
13665 break;
13666
13667 case 'i':
13668 case 'd':
13669 case 'u':
13670 case 'o':
13671 case 'x':
13672 case 'X':
13673 {
13674 int ret = mainformatlong(v, arg, p_str, writer);
13675 if (ret != 0)
13676 return ret;
13677 arg->sign = 1;
13678 break;
13679 }
13680
13681 case 'e':
13682 case 'E':
13683 case 'f':
13684 case 'F':
13685 case 'g':
13686 case 'G':
13687 if (arg->width == -1 && arg->prec == -1
13688 && !(arg->flags & (F_SIGN | F_BLANK)))
13689 {
13690 /* Fast path */
13691 if (formatfloat(v, arg, NULL, writer) == -1)
13692 return -1;
13693 return 1;
13694 }
13695
13696 arg->sign = 1;
13697 if (formatfloat(v, arg, p_str, NULL) == -1)
13698 return -1;
13699 break;
13700
13701 case 'c':
13702 {
13703 Py_UCS4 ch = formatchar(v);
13704 if (ch == (Py_UCS4) -1)
13705 return -1;
13706 if (arg->width == -1 && arg->prec == -1) {
13707 /* Fast path */
13708 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13709 return -1;
13710 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13711 writer->pos += 1;
13712 return 1;
13713 }
13714 *p_str = PyUnicode_FromOrdinal(ch);
13715 break;
13716 }
13717
13718 default:
13719 PyErr_Format(PyExc_ValueError,
13720 "unsupported format character '%c' (0x%x) "
13721 "at index %zd",
13722 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13723 (int)arg->ch,
13724 ctx->fmtpos - 1);
13725 return -1;
13726 }
13727 if (*p_str == NULL)
13728 return -1;
13729 assert (PyUnicode_Check(*p_str));
13730 return 0;
13731}
13732
13733static int
13734unicode_format_arg_output(struct unicode_formatter_t *ctx,
13735 struct unicode_format_arg_t *arg,
13736 PyObject *str)
13737{
13738 Py_ssize_t len;
13739 enum PyUnicode_Kind kind;
13740 void *pbuf;
13741 Py_ssize_t pindex;
13742 Py_UCS4 signchar;
13743 Py_ssize_t buflen;
13744 Py_UCS4 maxchar, bufmaxchar;
13745 Py_ssize_t sublen;
13746 _PyUnicodeWriter *writer = &ctx->writer;
13747 Py_UCS4 fill;
13748
13749 fill = ' ';
13750 if (arg->sign && arg->flags & F_ZERO)
13751 fill = '0';
13752
13753 if (PyUnicode_READY(str) == -1)
13754 return -1;
13755
13756 len = PyUnicode_GET_LENGTH(str);
13757 if ((arg->width == -1 || arg->width <= len)
13758 && (arg->prec == -1 || arg->prec >= len)
13759 && !(arg->flags & (F_SIGN | F_BLANK)))
13760 {
13761 /* Fast path */
13762 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13763 return -1;
13764 return 0;
13765 }
13766
13767 /* Truncate the string for "s", "r" and "a" formats
13768 if the precision is set */
13769 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13770 if (arg->prec >= 0 && len > arg->prec)
13771 len = arg->prec;
13772 }
13773
13774 /* Adjust sign and width */
13775 kind = PyUnicode_KIND(str);
13776 pbuf = PyUnicode_DATA(str);
13777 pindex = 0;
13778 signchar = '\0';
13779 if (arg->sign) {
13780 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13781 if (ch == '-' || ch == '+') {
13782 signchar = ch;
13783 len--;
13784 pindex++;
13785 }
13786 else if (arg->flags & F_SIGN)
13787 signchar = '+';
13788 else if (arg->flags & F_BLANK)
13789 signchar = ' ';
13790 else
13791 arg->sign = 0;
13792 }
13793 if (arg->width < len)
13794 arg->width = len;
13795
13796 /* Prepare the writer */
13797 bufmaxchar = 127;
13798 if (!(arg->flags & F_LJUST)) {
13799 if (arg->sign) {
13800 if ((arg->width-1) > len)
13801 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13802 }
13803 else {
13804 if (arg->width > len)
13805 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13806 }
13807 }
13808 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13809 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13810 buflen = arg->width;
13811 if (arg->sign && len == arg->width)
13812 buflen++;
13813 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13814 return -1;
13815
13816 /* Write the sign if needed */
13817 if (arg->sign) {
13818 if (fill != ' ') {
13819 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13820 writer->pos += 1;
13821 }
13822 if (arg->width > len)
13823 arg->width--;
13824 }
13825
13826 /* Write the numeric prefix for "x", "X" and "o" formats
13827 if the alternate form is used.
13828 For example, write "0x" for the "%#x" format. */
13829 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13830 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13831 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
13832 if (fill != ' ') {
13833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13834 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13835 writer->pos += 2;
13836 pindex += 2;
13837 }
13838 arg->width -= 2;
13839 if (arg->width < 0)
13840 arg->width = 0;
13841 len -= 2;
13842 }
13843
13844 /* Pad left with the fill character if needed */
13845 if (arg->width > len && !(arg->flags & F_LJUST)) {
13846 sublen = arg->width - len;
13847 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
13848 writer->pos += sublen;
13849 arg->width = len;
13850 }
13851
13852 /* If padding with spaces: write sign if needed and/or numeric prefix if
13853 the alternate form is used */
13854 if (fill == ' ') {
13855 if (arg->sign) {
13856 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13857 writer->pos += 1;
13858 }
13859 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
13860 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13861 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
13862 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
13863 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
13864 writer->pos += 2;
13865 pindex += 2;
13866 }
13867 }
13868
13869 /* Write characters */
13870 if (len) {
13871 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13872 str, pindex, len);
13873 writer->pos += len;
13874 }
13875
13876 /* Pad right with the fill character if needed */
13877 if (arg->width > len) {
13878 sublen = arg->width - len;
13879 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
13880 writer->pos += sublen;
13881 }
13882 return 0;
13883}
13884
13885/* Helper of PyUnicode_Format(): format one arg.
13886 Return 0 on success, raise an exception and return -1 on error. */
13887static int
13888unicode_format_arg(struct unicode_formatter_t *ctx)
13889{
13890 struct unicode_format_arg_t arg;
13891 PyObject *str;
13892 int ret;
13893
13894 ret = unicode_format_arg_parse(ctx, &arg);
13895 if (ret == -1)
13896 return -1;
13897
13898 ret = unicode_format_arg_format(ctx, &arg, &str);
13899 if (ret == -1)
13900 return -1;
13901
13902 if (ret != 1) {
13903 ret = unicode_format_arg_output(ctx, &arg, str);
13904 Py_DECREF(str);
13905 if (ret == -1)
13906 return -1;
13907 }
13908
13909 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13910 PyErr_SetString(PyExc_TypeError,
13911 "not all arguments converted during string formatting");
13912 return -1;
13913 }
13914 return 0;
13915}
13916
Alexander Belopolsky40018472011-02-26 01:02:56 +000013917PyObject *
13918PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919{
Victor Stinnera47082312012-10-04 02:19:54 +020013920 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013921
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013923 PyErr_BadInternalCall();
13924 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925 }
Victor Stinnera47082312012-10-04 02:19:54 +020013926
13927 ctx.fmtstr = PyUnicode_FromObject(format);
13928 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013929 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013930 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13931 Py_DECREF(ctx.fmtstr);
13932 return NULL;
13933 }
13934 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13935 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13936 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13937 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013938
Victor Stinnera47082312012-10-04 02:19:54 +020013939 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013940
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013942 ctx.arglen = PyTuple_Size(args);
13943 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944 }
13945 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013946 ctx.arglen = -1;
13947 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948 }
Victor Stinnera47082312012-10-04 02:19:54 +020013949 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013950 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013951 ctx.dict = args;
13952 else
13953 ctx.dict = NULL;
13954 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013955
Victor Stinnera47082312012-10-04 02:19:54 +020013956 while (--ctx.fmtcnt >= 0) {
13957 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13958 Py_ssize_t nonfmtpos, sublen;
13959 Py_UCS4 maxchar;
13960
13961 nonfmtpos = ctx.fmtpos++;
13962 while (ctx.fmtcnt >= 0 &&
13963 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13964 ctx.fmtpos++;
13965 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 }
Victor Stinnera47082312012-10-04 02:19:54 +020013967 if (ctx.fmtcnt < 0) {
13968 ctx.fmtpos--;
13969 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013970 }
Victor Stinnera47082312012-10-04 02:19:54 +020013971 sublen = ctx.fmtpos - nonfmtpos;
13972 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013973 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013974 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013975 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013976
Victor Stinnera47082312012-10-04 02:19:54 +020013977 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
13978 ctx.fmtstr, nonfmtpos, sublen);
13979 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 }
13981 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013982 ctx.fmtpos++;
13983 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013984 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020013985 }
13986 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013987
Victor Stinnera47082312012-10-04 02:19:54 +020013988 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013989 PyErr_SetString(PyExc_TypeError,
13990 "not all arguments converted during string formatting");
13991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013992 }
13993
Victor Stinnera47082312012-10-04 02:19:54 +020013994 if (ctx.args_owned) {
13995 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013996 }
Victor Stinnera47082312012-10-04 02:19:54 +020013997 Py_DECREF(ctx.fmtstr);
13998 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013999
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014001 Py_DECREF(ctx.fmtstr);
14002 _PyUnicodeWriter_Dealloc(&ctx.writer);
14003 if (ctx.args_owned) {
14004 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014005 }
14006 return NULL;
14007}
14008
Jeremy Hylton938ace62002-07-17 16:30:39 +000014009static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014010unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14011
Tim Peters6d6c1a32001-08-02 04:15:00 +000014012static PyObject *
14013unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14014{
Benjamin Peterson29060642009-01-31 22:14:21 +000014015 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014016 static char *kwlist[] = {"object", "encoding", "errors", 0};
14017 char *encoding = NULL;
14018 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014019
Benjamin Peterson14339b62009-01-31 16:36:08 +000014020 if (type != &PyUnicode_Type)
14021 return unicode_subtype_new(type, args, kwds);
14022 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014023 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014024 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014025 if (x == NULL) {
14026 Py_INCREF(unicode_empty);
14027 return unicode_empty;
14028 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 if (encoding == NULL && errors == NULL)
14030 return PyObject_Str(x);
14031 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014032 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014033}
14034
Guido van Rossume023fe02001-08-30 03:12:59 +000014035static PyObject *
14036unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14037{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014038 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014039 Py_ssize_t length, char_size;
14040 int share_wstr, share_utf8;
14041 unsigned int kind;
14042 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014043
Benjamin Peterson14339b62009-01-31 16:36:08 +000014044 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014045
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014046 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014047 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014048 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014049 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014050 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014051 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014053 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014054
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014055 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014056 if (self == NULL) {
14057 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014058 return NULL;
14059 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014060 kind = PyUnicode_KIND(unicode);
14061 length = PyUnicode_GET_LENGTH(unicode);
14062
14063 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014064#ifdef Py_DEBUG
14065 _PyUnicode_HASH(self) = -1;
14066#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014067 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014068#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014069 _PyUnicode_STATE(self).interned = 0;
14070 _PyUnicode_STATE(self).kind = kind;
14071 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014072 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014073 _PyUnicode_STATE(self).ready = 1;
14074 _PyUnicode_WSTR(self) = NULL;
14075 _PyUnicode_UTF8_LENGTH(self) = 0;
14076 _PyUnicode_UTF8(self) = NULL;
14077 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014078 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014079
14080 share_utf8 = 0;
14081 share_wstr = 0;
14082 if (kind == PyUnicode_1BYTE_KIND) {
14083 char_size = 1;
14084 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14085 share_utf8 = 1;
14086 }
14087 else if (kind == PyUnicode_2BYTE_KIND) {
14088 char_size = 2;
14089 if (sizeof(wchar_t) == 2)
14090 share_wstr = 1;
14091 }
14092 else {
14093 assert(kind == PyUnicode_4BYTE_KIND);
14094 char_size = 4;
14095 if (sizeof(wchar_t) == 4)
14096 share_wstr = 1;
14097 }
14098
14099 /* Ensure we won't overflow the length. */
14100 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14101 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014102 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014103 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014104 data = PyObject_MALLOC((length + 1) * char_size);
14105 if (data == NULL) {
14106 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014107 goto onError;
14108 }
14109
Victor Stinnerc3c74152011-10-02 20:39:55 +020014110 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014111 if (share_utf8) {
14112 _PyUnicode_UTF8_LENGTH(self) = length;
14113 _PyUnicode_UTF8(self) = data;
14114 }
14115 if (share_wstr) {
14116 _PyUnicode_WSTR_LENGTH(self) = length;
14117 _PyUnicode_WSTR(self) = (wchar_t *)data;
14118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014119
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014120 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014121 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014122 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014123#ifdef Py_DEBUG
14124 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14125#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014126 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014127 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014128
14129onError:
14130 Py_DECREF(unicode);
14131 Py_DECREF(self);
14132 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014133}
14134
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014135PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014136"str(object='') -> str\n\
14137str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014138\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014139Create a new string object from the given object. If encoding or\n\
14140errors is specified, then the object must expose a data buffer\n\
14141that will be decoded using the given encoding and error handler.\n\
14142Otherwise, returns the result of object.__str__() (if defined)\n\
14143or repr(object).\n\
14144encoding defaults to sys.getdefaultencoding().\n\
14145errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014146
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014147static PyObject *unicode_iter(PyObject *seq);
14148
Guido van Rossumd57fd912000-03-10 22:53:23 +000014149PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014150 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014151 "str", /* tp_name */
14152 sizeof(PyUnicodeObject), /* tp_size */
14153 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014154 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 (destructor)unicode_dealloc, /* tp_dealloc */
14156 0, /* tp_print */
14157 0, /* tp_getattr */
14158 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014159 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 unicode_repr, /* tp_repr */
14161 &unicode_as_number, /* tp_as_number */
14162 &unicode_as_sequence, /* tp_as_sequence */
14163 &unicode_as_mapping, /* tp_as_mapping */
14164 (hashfunc) unicode_hash, /* tp_hash*/
14165 0, /* tp_call*/
14166 (reprfunc) unicode_str, /* tp_str */
14167 PyObject_GenericGetAttr, /* tp_getattro */
14168 0, /* tp_setattro */
14169 0, /* tp_as_buffer */
14170 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014171 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 unicode_doc, /* tp_doc */
14173 0, /* tp_traverse */
14174 0, /* tp_clear */
14175 PyUnicode_RichCompare, /* tp_richcompare */
14176 0, /* tp_weaklistoffset */
14177 unicode_iter, /* tp_iter */
14178 0, /* tp_iternext */
14179 unicode_methods, /* tp_methods */
14180 0, /* tp_members */
14181 0, /* tp_getset */
14182 &PyBaseObject_Type, /* tp_base */
14183 0, /* tp_dict */
14184 0, /* tp_descr_get */
14185 0, /* tp_descr_set */
14186 0, /* tp_dictoffset */
14187 0, /* tp_init */
14188 0, /* tp_alloc */
14189 unicode_new, /* tp_new */
14190 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014191};
14192
14193/* Initialize the Unicode implementation */
14194
Victor Stinner3a50e702011-10-18 21:21:00 +020014195int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014196{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014197 int i;
14198
Thomas Wouters477c8d52006-05-27 19:21:47 +000014199 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014200 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014201 0x000A, /* LINE FEED */
14202 0x000D, /* CARRIAGE RETURN */
14203 0x001C, /* FILE SEPARATOR */
14204 0x001D, /* GROUP SEPARATOR */
14205 0x001E, /* RECORD SEPARATOR */
14206 0x0085, /* NEXT LINE */
14207 0x2028, /* LINE SEPARATOR */
14208 0x2029, /* PARAGRAPH SEPARATOR */
14209 };
14210
Fred Drakee4315f52000-05-09 19:53:39 +000014211 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014212 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014213 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014214 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014215 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014217 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014218 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014219 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014220 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014221
14222 /* initialize the linebreak bloom filter */
14223 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014224 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014225 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014226
14227 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014228
14229#ifdef HAVE_MBCS
14230 winver.dwOSVersionInfoSize = sizeof(winver);
14231 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14232 PyErr_SetFromWindowsErr(0);
14233 return -1;
14234 }
14235#endif
14236 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014237}
14238
14239/* Finalize the Unicode implementation */
14240
/* Free-list clearing entry point.  This implementation does no work and
   always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14246
Guido van Rossumd57fd912000-03-10 22:53:23 +000014247void
Thomas Wouters78890102000-07-22 19:25:51 +000014248_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014249{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014250 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014251
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014252 Py_XDECREF(unicode_empty);
14253 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014254
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014255 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014256 if (unicode_latin1[i]) {
14257 Py_DECREF(unicode_latin1[i]);
14258 unicode_latin1[i] = NULL;
14259 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014260 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014261 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014262 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014263}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014264
Walter Dörwald16807132007-05-25 13:52:07 +000014265void
14266PyUnicode_InternInPlace(PyObject **p)
14267{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014268 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014270#ifdef Py_DEBUG
14271 assert(s != NULL);
14272 assert(_PyUnicode_CHECK(s));
14273#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014275 return;
14276#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 /* If it's a subclass, we don't really know what putting
14278 it in the interned dict might do. */
14279 if (!PyUnicode_CheckExact(s))
14280 return;
14281 if (PyUnicode_CHECK_INTERNED(s))
14282 return;
14283 if (interned == NULL) {
14284 interned = PyDict_New();
14285 if (interned == NULL) {
14286 PyErr_Clear(); /* Don't leave an exception */
14287 return;
14288 }
14289 }
14290 /* It might be that the GetItem call fails even
14291 though the key is present in the dictionary,
14292 namely when this happens during a stack overflow. */
14293 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014294 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014295 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014296
Benjamin Peterson29060642009-01-31 22:14:21 +000014297 if (t) {
14298 Py_INCREF(t);
14299 Py_DECREF(*p);
14300 *p = t;
14301 return;
14302 }
Walter Dörwald16807132007-05-25 13:52:07 +000014303
Benjamin Peterson14339b62009-01-31 16:36:08 +000014304 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014305 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 PyErr_Clear();
14307 PyThreadState_GET()->recursion_critical = 0;
14308 return;
14309 }
14310 PyThreadState_GET()->recursion_critical = 0;
14311 /* The two references in interned are not counted by refcnt.
14312 The deallocator will take care of this */
14313 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014314 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014315}
14316
14317void
14318PyUnicode_InternImmortal(PyObject **p)
14319{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 PyUnicode_InternInPlace(p);
14321 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014322 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014323 Py_INCREF(*p);
14324 }
Walter Dörwald16807132007-05-25 13:52:07 +000014325}
14326
14327PyObject *
14328PyUnicode_InternFromString(const char *cp)
14329{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014330 PyObject *s = PyUnicode_FromString(cp);
14331 if (s == NULL)
14332 return NULL;
14333 PyUnicode_InternInPlace(&s);
14334 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014335}
14336
Alexander Belopolsky40018472011-02-26 01:02:56 +000014337void
14338_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014339{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014340 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014341 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014342 Py_ssize_t i, n;
14343 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014344
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 if (interned == NULL || !PyDict_Check(interned))
14346 return;
14347 keys = PyDict_Keys(interned);
14348 if (keys == NULL || !PyList_Check(keys)) {
14349 PyErr_Clear();
14350 return;
14351 }
Walter Dörwald16807132007-05-25 13:52:07 +000014352
Benjamin Peterson14339b62009-01-31 16:36:08 +000014353 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14354 detector, interned unicode strings are not forcibly deallocated;
14355 rather, we give them their stolen references back, and then clear
14356 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014357
Benjamin Peterson14339b62009-01-31 16:36:08 +000014358 n = PyList_GET_SIZE(keys);
14359 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014360 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014362 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014363 if (PyUnicode_READY(s) == -1) {
14364 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014365 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014367 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014368 case SSTATE_NOT_INTERNED:
14369 /* XXX Shouldn't happen */
14370 break;
14371 case SSTATE_INTERNED_IMMORTAL:
14372 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014373 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 break;
14375 case SSTATE_INTERNED_MORTAL:
14376 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014377 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014378 break;
14379 default:
14380 Py_FatalError("Inconsistent interned string state.");
14381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014382 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 }
14384 fprintf(stderr, "total size of all interned strings: "
14385 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14386 "mortal/immortal\n", mortal_size, immortal_size);
14387 Py_DECREF(keys);
14388 PyDict_Clear(interned);
14389 Py_DECREF(interned);
14390 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014391}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014392
14393
14394/********************* Unicode Iterator **************************/
14395
14396typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014397 PyObject_HEAD
14398 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014399 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014400} unicodeiterobject;
14401
14402static void
14403unicodeiter_dealloc(unicodeiterobject *it)
14404{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014405 _PyObject_GC_UNTRACK(it);
14406 Py_XDECREF(it->it_seq);
14407 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014408}
14409
14410static int
14411unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14412{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 Py_VISIT(it->it_seq);
14414 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014415}
14416
14417static PyObject *
14418unicodeiter_next(unicodeiterobject *it)
14419{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014420 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014421
Benjamin Peterson14339b62009-01-31 16:36:08 +000014422 assert(it != NULL);
14423 seq = it->it_seq;
14424 if (seq == NULL)
14425 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014426 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014428 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14429 int kind = PyUnicode_KIND(seq);
14430 void *data = PyUnicode_DATA(seq);
14431 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14432 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 if (item != NULL)
14434 ++it->it_index;
14435 return item;
14436 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014437
Benjamin Peterson14339b62009-01-31 16:36:08 +000014438 Py_DECREF(seq);
14439 it->it_seq = NULL;
14440 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014441}
14442
14443static PyObject *
14444unicodeiter_len(unicodeiterobject *it)
14445{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014446 Py_ssize_t len = 0;
14447 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014448 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014449 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014450}
14451
14452PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14453
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014454static PyObject *
14455unicodeiter_reduce(unicodeiterobject *it)
14456{
14457 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014458 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014459 it->it_seq, it->it_index);
14460 } else {
14461 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14462 if (u == NULL)
14463 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014464 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014465 }
14466}
14467
14468PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14469
14470static PyObject *
14471unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14472{
14473 Py_ssize_t index = PyLong_AsSsize_t(state);
14474 if (index == -1 && PyErr_Occurred())
14475 return NULL;
14476 if (index < 0)
14477 index = 0;
14478 it->it_index = index;
14479 Py_RETURN_NONE;
14480}
14481
14482PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14483
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014484static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014485 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014486 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014487 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14488 reduce_doc},
14489 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14490 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014491 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014492};
14493
14494PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014495 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14496 "str_iterator", /* tp_name */
14497 sizeof(unicodeiterobject), /* tp_basicsize */
14498 0, /* tp_itemsize */
14499 /* methods */
14500 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14501 0, /* tp_print */
14502 0, /* tp_getattr */
14503 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014504 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014505 0, /* tp_repr */
14506 0, /* tp_as_number */
14507 0, /* tp_as_sequence */
14508 0, /* tp_as_mapping */
14509 0, /* tp_hash */
14510 0, /* tp_call */
14511 0, /* tp_str */
14512 PyObject_GenericGetAttr, /* tp_getattro */
14513 0, /* tp_setattro */
14514 0, /* tp_as_buffer */
14515 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14516 0, /* tp_doc */
14517 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14518 0, /* tp_clear */
14519 0, /* tp_richcompare */
14520 0, /* tp_weaklistoffset */
14521 PyObject_SelfIter, /* tp_iter */
14522 (iternextfunc)unicodeiter_next, /* tp_iternext */
14523 unicodeiter_methods, /* tp_methods */
14524 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014525};
14526
14527static PyObject *
14528unicode_iter(PyObject *seq)
14529{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014530 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014531
Benjamin Peterson14339b62009-01-31 16:36:08 +000014532 if (!PyUnicode_Check(seq)) {
14533 PyErr_BadInternalCall();
14534 return NULL;
14535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014536 if (PyUnicode_READY(seq) == -1)
14537 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014538 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14539 if (it == NULL)
14540 return NULL;
14541 it->it_index = 0;
14542 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014543 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014544 _PyObject_GC_TRACK(it);
14545 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014546}
14547
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014548
14549size_t
14550Py_UNICODE_strlen(const Py_UNICODE *u)
14551{
14552 int res = 0;
14553 while(*u++)
14554 res++;
14555 return res;
14556}
14557
14558Py_UNICODE*
14559Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14560{
14561 Py_UNICODE *u = s1;
14562 while ((*u++ = *s2++));
14563 return s1;
14564}
14565
14566Py_UNICODE*
14567Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14568{
14569 Py_UNICODE *u = s1;
14570 while ((*u++ = *s2++))
14571 if (n-- == 0)
14572 break;
14573 return s1;
14574}
14575
14576Py_UNICODE*
14577Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14578{
14579 Py_UNICODE *u1 = s1;
14580 u1 += Py_UNICODE_strlen(u1);
14581 Py_UNICODE_strcpy(u1, s2);
14582 return s1;
14583}
14584
14585int
14586Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14587{
14588 while (*s1 && *s2 && *s1 == *s2)
14589 s1++, s2++;
14590 if (*s1 && *s2)
14591 return (*s1 < *s2) ? -1 : +1;
14592 if (*s1)
14593 return 1;
14594 if (*s2)
14595 return -1;
14596 return 0;
14597}
14598
14599int
14600Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14601{
14602 register Py_UNICODE u1, u2;
14603 for (; n != 0; n--) {
14604 u1 = *s1;
14605 u2 = *s2;
14606 if (u1 != u2)
14607 return (u1 < u2) ? -1 : +1;
14608 if (u1 == '\0')
14609 return 0;
14610 s1++;
14611 s2++;
14612 }
14613 return 0;
14614}
14615
14616Py_UNICODE*
14617Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14618{
14619 const Py_UNICODE *p;
14620 for (p = s; *p; p++)
14621 if (*p == c)
14622 return (Py_UNICODE*)p;
14623 return NULL;
14624}
14625
14626Py_UNICODE*
14627Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14628{
14629 const Py_UNICODE *p;
14630 p = s + Py_UNICODE_strlen(s);
14631 while (p != s) {
14632 p--;
14633 if (*p == c)
14634 return (Py_UNICODE*)p;
14635 }
14636 return NULL;
14637}
Victor Stinner331ea922010-08-10 16:37:20 +000014638
Victor Stinner71133ff2010-09-01 23:43:53 +000014639Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014640PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014641{
Victor Stinner577db2c2011-10-11 22:12:48 +020014642 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014643 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014645 if (!PyUnicode_Check(unicode)) {
14646 PyErr_BadArgument();
14647 return NULL;
14648 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014649 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014650 if (u == NULL)
14651 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014652 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014653 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014654 PyErr_NoMemory();
14655 return NULL;
14656 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014657 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014658 size *= sizeof(Py_UNICODE);
14659 copy = PyMem_Malloc(size);
14660 if (copy == NULL) {
14661 PyErr_NoMemory();
14662 return NULL;
14663 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014664 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014665 return copy;
14666}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014667
Georg Brandl66c221e2010-10-14 07:04:07 +000014668/* A _string module, to export formatter_parser and formatter_field_name_split
14669 to the string.Formatter class implemented in Python. */
14670
14671static PyMethodDef _string_methods[] = {
14672 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14673 METH_O, PyDoc_STR("split the argument as a field name")},
14674 {"formatter_parser", (PyCFunction) formatter_parser,
14675 METH_O, PyDoc_STR("parse the argument as a format string")},
14676 {NULL, NULL}
14677};
14678
14679static struct PyModuleDef _string_module = {
14680 PyModuleDef_HEAD_INIT,
14681 "_string",
14682 PyDoc_STR("string helper module"),
14683 0,
14684 _string_methods,
14685 NULL,
14686 NULL,
14687 NULL,
14688 NULL
14689};
14690
14691PyMODINIT_FUNC
14692PyInit__string(void)
14693{
14694 return PyModule_Create(&_string_module);
14695}
14696
14697
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014698#ifdef __cplusplus
14699}
14700#endif