blob: 40e56cdced8d06b431c5ec1d63e8a4b76118f95b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order switches: exactly one of BYTEORDER_IS_BIG_ENDIAN and
   BYTEORDER_IS_LITTLE_ENDIAN gets defined.  Little endian is the default;
   big endian is selected only when WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Check applied by the assertions of the macros in this file: debug builds
   run the structural consistency checker (without the content scan), all
   other builds only test the object type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Unchecked field accessors.  Each one expands to a plain member expression
   (so it can also be used as an assignment target) and performs no
   validation of `op`. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked accessors: they assert _PyUnicode_CHECK(op) before reading. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 representation of a ready string.  A compact ASCII string has no
   utf8 field of its own: the bytes stored right after the PyASCIIObject
   header are used instead. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8(op) :                          \
         ((char*)((PyASCIIObject*)(op) + 1)))

/* Length of the UTF-8 representation of a ready string; for a compact
   ASCII string this is the string length itself. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8_LENGTH(op) :                   \
         ((PyASCIIObject*)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Optimized replacement for Py_MAX() when combining two maximum characters:
   the operands are simply OR-ed together.  Use it when you are computing
   the second argument of PyUnicode_New(). */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
118
/* File-local replacement of PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
      _PyUnicode_Ready(op) :                            \
      0))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the UTF-8 pointer of `op` aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` aliases its character data.
   The macro argument is used for both sides of the comparison; it no longer
   depends on the caller having a variable that is literally named
   `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e. the
   utf8 pointer is set and is not shared with the character data.  Compact
   ASCII strings never do. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own, i.e. the
   wstr pointer is set and the string is either not ready yet or its wstr
   is not shared with the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) != NULL &&                    \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every character is converted with a plain cast to to_type.  The bulk of
   the input is copied four characters per iteration, the remainder one at
   a time.  All macro arguments are parenthesized before use, so `to` may be
   an arbitrary pointer expression, and the internal locals carry a leading
   underscore to stay out of the caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: a table
   indexed by code point (0x00..0x7F) whose entry is 1 for whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Same kind of table for linebreaks: indexed by code point (0x00..0x7F),
   the entry is 1 when the character is a line break.  The table is only
   ever read, hence it is const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Verify the internal invariants of a unicode object (debug builds only).

   Every invariant is checked with assert(); when all of them hold the
   function returns 1, so a call can itself be wrapped in assert().

   op            -- object to inspect; must satisfy PyUnicode_Check().
   check_content -- when non-zero, and unless the kind is
                    PyUnicode_WCHAR_KIND, additionally scan the characters
                    to check that the narrowest suitable kind is used and
                    that the character after the last one is 0. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *ext = (PyCompactUnicodeObject *)op;
        void *chars;

        if (base->state.compact != 1) {
            /* non-compact string: characters live in data.any */
            chars = ((PyUnicodeObject *)op)->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(chars != NULL);
                if (base->state.ascii) {
                    assert(ext->utf8 == chars);
                    assert(ext->utf8_length == base->length);
                }
                else {
                    assert(ext->utf8 != chars);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(chars == NULL);
                assert(ext->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: characters follow the struct */
            chars = ext + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(ext->utf8 != chars);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the character data exactly when the kind has the
               same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int same_width = (kind == PyUnicode_2BYTE_KIND);
#else
            const int same_width = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (same_width) {
                assert(base->wstr == chars);
                assert(ext->wstr_length == base->length);
            }
            else {
                assert(base->wstr != chars);
            }
        }

        /* a missing representation must have a zero length */
        assert(ext->utf8 != NULL || ext->utf8_length == 0);
        assert(base->wstr != NULL || ext->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the buffer must be null-terminated */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415 len = _PyUnicode_WSTR_LENGTH(unicode);
416 if (len == 0) {
417 Py_INCREF(unicode_empty);
418 Py_DECREF(unicode);
419 return unicode_empty;
420 }
421
422 if (len == 1) {
423 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
424 if (ch < 256) {
425 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
426 Py_DECREF(unicode);
427 return latin1_char;
428 }
429 }
430
431 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200432 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 return NULL;
434 }
435#else
Victor Stinneraa771272012-10-04 02:32:58 +0200436 assert(Py_REFCNT(unicode) == 1);
437
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
#ifdef HAVE_MBCS
/* Windows version information; only present when HAVE_MBCS is defined.
   NOTE(review): it is filled in and consumed outside the code visible
   here -- confirm where before relying on its contents. */
static OSVERSIONINFOEX winver;
#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
513/* stuff to implement simple "bloom filters" for Unicode characters.
514 to keep things simple, we use a single bitmask, using the least 5
515 bits from each unicode characters as the bit index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
Antoine Pitrouf068f942010-01-13 14:19:12 +0000519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529#define BLOOM_MASK unsigned long
530
531static BLOOM_MASK bloom_linebreak;
532
/* BLOOM_ADD(mask, ch): set the bit of character `ch` in `mask` (which must
   be a modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit of character `ch` is set in `mask`;
   zero means that `ch` was definitely never added.
   The mask argument is parenthesized so that an expression such as
   `a | b` can be passed without changing the meaning of the test. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if `ch` is a line break: characters below 128 are looked up in
   the ascii_linebreak table, all others go through the bloom filter first
   and are confirmed with Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
/* True if character `chr` occurs in the string `str`.  `mask` is the bloom
   mask to test first; the actual search with PyUnicode_FindChar() is only
   performed when the bit of `chr` is set in it. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index `old_length` up to the current length are
   overwritten (every byte is set to 0xff); nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200670#ifdef Py_DEBUG
671 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
672#endif
673
Victor Stinner79891572012-05-03 13:43:07 +0200674 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100676 assert(PyUnicode_IS_COMPACT(unicode));
677
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200678 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 struct_size = sizeof(PyASCIIObject);
681 else
682 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200683 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
686 PyErr_NoMemory();
687 return NULL;
688 }
689 new_size = (struct_size + (length + 1) * char_size);
690
Victor Stinner84def372011-12-11 20:04:56 +0100691 _Py_DEC_REFTOTAL;
692 _Py_ForgetReference(unicode);
693
694 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
695 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100696 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 PyErr_NoMemory();
698 return NULL;
699 }
Victor Stinner84def372011-12-11 20:04:56 +0100700 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100702
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200704 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100706 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 _PyUnicode_WSTR_LENGTH(unicode) = length;
708 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200709#ifdef Py_DEBUG
710 unicode_fill_invalid(unicode, old_length);
711#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
713 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200714 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 return unicode;
716}
717
Alexander Belopolsky40018472011-02-26 01:02:56 +0000718static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200719resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720{
Victor Stinner95663112011-10-04 01:03:50 +0200721 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000725
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 if (PyUnicode_IS_READY(unicode)) {
727 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200730#ifdef Py_DEBUG
731 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
732#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733
734 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200735 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200736 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
737 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738
739 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
740 PyErr_NoMemory();
741 return -1;
742 }
743 new_size = (length + 1) * char_size;
744
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
746 {
747 PyObject_DEL(_PyUnicode_UTF8(unicode));
748 _PyUnicode_UTF8(unicode) = NULL;
749 _PyUnicode_UTF8_LENGTH(unicode) = 0;
750 }
751
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 data = (PyObject *)PyObject_REALLOC(data, new_size);
753 if (data == NULL) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200760 _PyUnicode_WSTR_LENGTH(unicode) = length;
761 }
762 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200764 _PyUnicode_UTF8_LENGTH(unicode) = length;
765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 _PyUnicode_LENGTH(unicode) = length;
767 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 unicode_fill_invalid(unicode, old_length);
770#endif
Victor Stinner95663112011-10-04 01:03:50 +0200771 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200772 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
Victor Stinner95663112011-10-04 01:03:50 +0200776 assert(_PyUnicode_WSTR(unicode) != NULL);
777
778 /* check for integer overflow */
779 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
780 PyErr_NoMemory();
781 return -1;
782 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200784 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200786 if (!wstr) {
787 PyErr_NoMemory();
788 return -1;
789 }
790 _PyUnicode_WSTR(unicode) = wstr;
791 _PyUnicode_WSTR(unicode)[length] = 0;
792 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200793 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 return 0;
795}
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797static PyObject*
798resize_copy(PyObject *unicode, Py_ssize_t length)
799{
800 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100801 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200802 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Benjamin Petersonbac79492012-01-14 13:34:47 -0500804 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100805 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806
807 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
808 if (copy == NULL)
809 return NULL;
810
811 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200812 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200814 }
815 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100817
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200818 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 if (w == NULL)
820 return NULL;
821 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
822 copy_length = Py_MIN(copy_length, length);
823 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
824 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200825 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200826 }
827}
828
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000830 Ux0000 terminated; some code (e.g. new_identifier)
831 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832
833 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000834 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
836*/
837
Alexander Belopolsky40018472011-02-26 01:02:56 +0000838static PyUnicodeObject *
839_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840{
841 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843
Thomas Wouters477c8d52006-05-27 19:21:47 +0000844 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 if (length == 0 && unicode_empty != NULL) {
846 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200847 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 }
849
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000850 /* Ensure we won't overflow the size. */
851 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
852 return (PyUnicodeObject *)PyErr_NoMemory();
853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 if (length < 0) {
855 PyErr_SetString(PyExc_SystemError,
856 "Negative size passed to _PyUnicode_New");
857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000858 }
859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
861 if (unicode == NULL)
862 return NULL;
863 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
864 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
865 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100866 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000867 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100868 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870
Jeremy Hyltond8082792003-09-16 19:41:39 +0000871 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000872 * the caller fails before initializing str -- unicode_resize()
873 * reads str[0], and the Keep-Alive optimization can keep memory
874 * allocated for str alive across a call to unicode_dealloc(unicode).
875 * We don't want unicode_resize to read uninitialized memory in
876 * that case.
877 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200878 _PyUnicode_WSTR(unicode)[0] = 0;
879 _PyUnicode_WSTR(unicode)[length] = 0;
880 _PyUnicode_WSTR_LENGTH(unicode) = length;
881 _PyUnicode_HASH(unicode) = -1;
882 _PyUnicode_STATE(unicode).interned = 0;
883 _PyUnicode_STATE(unicode).kind = 0;
884 _PyUnicode_STATE(unicode).compact = 0;
885 _PyUnicode_STATE(unicode).ready = 0;
886 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200887 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200889 _PyUnicode_UTF8(unicode) = NULL;
890 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100891 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 return unicode;
893}
894
Victor Stinnerf42dc442011-10-02 23:33:16 +0200895static const char*
896unicode_kind_name(PyObject *unicode)
897{
Victor Stinner42dfd712011-10-03 14:41:45 +0200898 /* don't check consistency: unicode_kind_name() is called from
899 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 if (!PyUnicode_IS_COMPACT(unicode))
901 {
902 if (!PyUnicode_IS_READY(unicode))
903 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600904 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 {
906 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 return "legacy ascii";
909 else
910 return "legacy latin1";
911 case PyUnicode_2BYTE_KIND:
912 return "legacy UCS2";
913 case PyUnicode_4BYTE_KIND:
914 return "legacy UCS4";
915 default:
916 return "<legacy invalid kind>";
917 }
918 }
919 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600920 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200922 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200923 return "ascii";
924 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200925 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200926 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200927 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200928 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200929 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930 default:
931 return "<invalid compact kind>";
932 }
933}
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
940
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
944void *_PyUnicode_data(void *unicode){
945 printf("obj %p\n", unicode);
946 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
947 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
948 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
949 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
950 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
951 return PyUnicode_DATA(unicode);
952}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200953
954void
955_PyUnicode_Dump(PyObject *op)
956{
957 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200958 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
959 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
960 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200961
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 {
964 if (ascii->state.ascii)
965 data = (ascii + 1);
966 else
967 data = (compact + 1);
968 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 else
970 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200971 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
972
Victor Stinnera849a4b2011-10-03 12:12:11 +0200973 if (ascii->wstr == data)
974 printf("shared ");
975 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200976
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200978 printf(" (%zu), ", compact->wstr_length);
979 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
980 printf("shared ");
981 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200982 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200983 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200984}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985#endif
986
987PyObject *
988PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
989{
990 PyObject *obj;
991 PyCompactUnicodeObject *unicode;
992 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200993 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200994 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 Py_ssize_t char_size;
996 Py_ssize_t struct_size;
997
998 /* Optimization for empty strings */
999 if (size == 0 && unicode_empty != NULL) {
1000 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001001 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
1003
Victor Stinner9e9d6892011-10-04 01:02:02 +02001004 is_ascii = 0;
1005 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 struct_size = sizeof(PyCompactUnicodeObject);
1007 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001008 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 char_size = 1;
1010 is_ascii = 1;
1011 struct_size = sizeof(PyASCIIObject);
1012 }
1013 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001014 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001015 char_size = 1;
1016 }
1017 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001018 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019 char_size = 2;
1020 if (sizeof(wchar_t) == 2)
1021 is_sharing = 1;
1022 }
1023 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001024 if (maxchar > MAX_UNICODE) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "invalid maximum character passed to PyUnicode_New");
1027 return NULL;
1028 }
Victor Stinner8f825062012-04-27 13:55:39 +02001029 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 char_size = 4;
1031 if (sizeof(wchar_t) == 4)
1032 is_sharing = 1;
1033 }
1034
1035 /* Ensure we won't overflow the size. */
1036 if (size < 0) {
1037 PyErr_SetString(PyExc_SystemError,
1038 "Negative size passed to PyUnicode_New");
1039 return NULL;
1040 }
1041 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1042 return PyErr_NoMemory();
1043
1044 /* Duplicated allocation code from _PyObject_New() instead of a call to
1045 * PyObject_New() so we are able to allocate space for the object and
1046 * it's data buffer.
1047 */
1048 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1049 if (obj == NULL)
1050 return PyErr_NoMemory();
1051 obj = PyObject_INIT(obj, &PyUnicode_Type);
1052 if (obj == NULL)
1053 return NULL;
1054
1055 unicode = (PyCompactUnicodeObject *)obj;
1056 if (is_ascii)
1057 data = ((PyASCIIObject*)obj) + 1;
1058 else
1059 data = unicode + 1;
1060 _PyUnicode_LENGTH(unicode) = size;
1061 _PyUnicode_HASH(unicode) = -1;
1062 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 _PyUnicode_STATE(unicode).compact = 1;
1065 _PyUnicode_STATE(unicode).ready = 1;
1066 _PyUnicode_STATE(unicode).ascii = is_ascii;
1067 if (is_ascii) {
1068 ((char*)data)[size] = 0;
1069 _PyUnicode_WSTR(unicode) = NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((char*)data)[size] = 0;
1073 _PyUnicode_WSTR(unicode) = NULL;
1074 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001076 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 else {
1079 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001080 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001081 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001083 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 ((Py_UCS4*)data)[size] = 0;
1085 if (is_sharing) {
1086 _PyUnicode_WSTR_LENGTH(unicode) = size;
1087 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1088 }
1089 else {
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1091 _PyUnicode_WSTR(unicode) = NULL;
1092 }
1093 }
Victor Stinner8f825062012-04-27 13:55:39 +02001094#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001095 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001096#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001097 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 return obj;
1099}
1100
1101#if SIZEOF_WCHAR_T == 2
1102/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1103 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001104 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001105
1106 This function assumes that unicode can hold one more code point than wstr
1107 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001108static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001110 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001111{
1112 const wchar_t *iter;
1113 Py_UCS4 *ucs4_out;
1114
Victor Stinner910337b2011-10-03 03:20:16 +02001115 assert(unicode != NULL);
1116 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1118 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1119
1120 for (iter = begin; iter < end; ) {
1121 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1122 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001123 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1124 && (iter+1) < end
1125 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 {
Victor Stinner551ac952011-11-29 22:58:13 +01001127 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128 iter += 2;
1129 }
1130 else {
1131 *ucs4_out++ = *iter;
1132 iter++;
1133 }
1134 }
1135 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1136 _PyUnicode_GET_LENGTH(unicode)));
1137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138}
1139#endif
1140
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141static int
Victor Stinner488fa492011-12-12 00:01:39 +01001142unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001143{
Victor Stinner488fa492011-12-12 00:01:39 +01001144 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001145 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001146 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001147 return -1;
1148 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001149 return 0;
1150}
1151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152static int
1153_copy_characters(PyObject *to, Py_ssize_t to_start,
1154 PyObject *from, Py_ssize_t from_start,
1155 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001157 unsigned int from_kind, to_kind;
1158 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(0 <= how_many);
1161 assert(0 <= from_start);
1162 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001164 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001165 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166
Victor Stinnerd3f08822012-05-29 12:57:52 +02001167 assert(PyUnicode_Check(to));
1168 assert(PyUnicode_IS_READY(to));
1169 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1170
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001171 if (how_many == 0)
1172 return 0;
1173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001177 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178
Victor Stinnerf1852262012-06-16 16:38:26 +02001179#ifdef Py_DEBUG
1180 if (!check_maxchar
1181 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1182 {
1183 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1184 Py_UCS4 ch;
1185 Py_ssize_t i;
1186 for (i=0; i < how_many; i++) {
1187 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1188 assert(ch <= to_maxchar);
1189 }
1190 }
1191#endif
1192
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001193 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 if (check_maxchar
1195 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1196 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001197 /* Writing Latin-1 characters into an ASCII string requires to
1198 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001199 Py_UCS4 max_char;
1200 max_char = ucs1lib_find_max_char(from_data,
1201 (Py_UCS1*)from_data + how_many);
1202 if (max_char >= 128)
1203 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001204 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001205 Py_MEMCPY((char*)to_data + to_kind * to_start,
1206 (char*)from_data + from_kind * from_start,
1207 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
1210 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS2,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_2BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001219 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS1, Py_UCS4,
1224 PyUnicode_1BYTE_DATA(from) + from_start,
1225 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
1229 else if (from_kind == PyUnicode_2BYTE_KIND
1230 && to_kind == PyUnicode_4BYTE_KIND)
1231 {
1232 _PyUnicode_CONVERT_BYTES(
1233 Py_UCS2, Py_UCS4,
1234 PyUnicode_2BYTE_DATA(from) + from_start,
1235 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1236 PyUnicode_4BYTE_DATA(to) + to_start
1237 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001238 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001239 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1241
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 if (!check_maxchar) {
1243 if (from_kind == PyUnicode_2BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS2, Py_UCS1,
1248 PyUnicode_2BYTE_DATA(from) + from_start,
1249 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_1BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS1,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_1BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else if (from_kind == PyUnicode_4BYTE_KIND
1264 && to_kind == PyUnicode_2BYTE_KIND)
1265 {
1266 _PyUnicode_CONVERT_BYTES(
1267 Py_UCS4, Py_UCS2,
1268 PyUnicode_4BYTE_DATA(from) + from_start,
1269 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1270 PyUnicode_2BYTE_DATA(to) + to_start
1271 );
1272 }
1273 else {
1274 assert(0);
1275 return -1;
1276 }
1277 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001278 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001280 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 Py_ssize_t i;
1282
Victor Stinnera0702ab2011-09-29 14:14:38 +02001283 for (i=0; i < how_many; i++) {
1284 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (ch > to_maxchar)
1286 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001287 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 }
1290 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return 0;
1292}
1293
Victor Stinnerd3f08822012-05-29 12:57:52 +02001294void
1295_PyUnicode_FastCopyCharacters(
1296 PyObject *to, Py_ssize_t to_start,
1297 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001298{
1299 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1300}
1301
1302Py_ssize_t
1303PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1304 PyObject *from, Py_ssize_t from_start,
1305 Py_ssize_t how_many)
1306{
1307 int err;
1308
1309 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1310 PyErr_BadInternalCall();
1311 return -1;
1312 }
1313
Benjamin Petersonbac79492012-01-14 13:34:47 -05001314 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001316 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 return -1;
1318
Victor Stinnerd3f08822012-05-29 12:57:52 +02001319 if (from_start < 0) {
1320 PyErr_SetString(PyExc_IndexError, "string index out of range");
1321 return -1;
1322 }
1323 if (to_start < 0) {
1324 PyErr_SetString(PyExc_IndexError, "string index out of range");
1325 return -1;
1326 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001327 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1328 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1329 PyErr_Format(PyExc_SystemError,
1330 "Cannot write %zi characters at %zi "
1331 "in a string of %zi characters",
1332 how_many, to_start, PyUnicode_GET_LENGTH(to));
1333 return -1;
1334 }
1335
1336 if (how_many == 0)
1337 return 0;
1338
Victor Stinner488fa492011-12-12 00:01:39 +01001339 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340 return -1;
1341
1342 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1343 if (err) {
1344 PyErr_Format(PyExc_SystemError,
1345 "Cannot copy %s characters "
1346 "into a string of %s characters",
1347 unicode_kind_name(from),
1348 unicode_kind_name(to));
1349 return -1;
1350 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001351 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352}
1353
Victor Stinner17222162011-09-28 22:15:37 +02001354/* Find the maximum code point and count the number of surrogate pairs so a
1355 correct string length can be computed before converting a string to UCS4.
1356 This function counts single surrogates as a character and not as a pair.
1357
1358 Return 0 on success, or -1 on error. */
1359static int
1360find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1361 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362{
1363 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001364 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365
Victor Stinnerc53be962011-10-02 21:33:54 +02001366 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 *num_surrogates = 0;
1368 *maxchar = 0;
1369
1370 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001372 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1373 && (iter+1) < end
1374 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001376 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 iter += 2;
1379 }
1380 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001382 {
1383 ch = *iter;
1384 iter++;
1385 }
1386 if (ch > *maxchar) {
1387 *maxchar = ch;
1388 if (*maxchar > MAX_UNICODE) {
1389 PyErr_Format(PyExc_ValueError,
1390 "character U+%x is not in range [U+0000; U+10ffff]",
1391 ch);
1392 return -1;
1393 }
1394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 return 0;
1397}
1398
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001399int
1400_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401{
1402 wchar_t *end;
1403 Py_UCS4 maxchar = 0;
1404 Py_ssize_t num_surrogates;
1405#if SIZEOF_WCHAR_T == 2
1406 Py_ssize_t length_wo_surrogates;
1407#endif
1408
Georg Brandl7597add2011-10-05 16:36:47 +02001409 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001410 strings were created using _PyObject_New() and where no canonical
1411 representation (the str field) has been set yet aka strings
1412 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001413 assert(_PyUnicode_CHECK(unicode));
1414 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001416 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001418 /* Actually, it should neither be interned nor be anything else: */
1419 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001422 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001423 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425
1426 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001427 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1428 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 PyErr_NoMemory();
1430 return -1;
1431 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001432 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 _PyUnicode_WSTR(unicode), end,
1434 PyUnicode_1BYTE_DATA(unicode));
1435 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1436 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1437 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1438 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001439 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001441 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 }
1443 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001444 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001445 _PyUnicode_UTF8(unicode) = NULL;
1446 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447 }
1448 PyObject_FREE(_PyUnicode_WSTR(unicode));
1449 _PyUnicode_WSTR(unicode) = NULL;
1450 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1451 }
1452 /* In this case we might have to convert down from 4-byte native
1453 wchar_t to 2-byte unicode. */
1454 else if (maxchar < 65536) {
1455 assert(num_surrogates == 0 &&
1456 "FindMaxCharAndNumSurrogatePairs() messed up");
1457
Victor Stinner506f5922011-09-28 22:34:18 +02001458#if SIZEOF_WCHAR_T == 2
1459 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001460 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001461 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1462 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1463 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 _PyUnicode_UTF8(unicode) = NULL;
1465 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001466#else
1467 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001468 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001469 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001471 PyErr_NoMemory();
1472 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 }
Victor Stinner506f5922011-09-28 22:34:18 +02001474 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1475 _PyUnicode_WSTR(unicode), end,
1476 PyUnicode_2BYTE_DATA(unicode));
1477 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1478 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1479 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001480 _PyUnicode_UTF8(unicode) = NULL;
1481 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001482 PyObject_FREE(_PyUnicode_WSTR(unicode));
1483 _PyUnicode_WSTR(unicode) = NULL;
1484 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1485#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 }
1487 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1488 else {
1489#if SIZEOF_WCHAR_T == 2
1490 /* in case the native representation is 2-bytes, we need to allocate a
1491 new normalized 4-byte version. */
1492 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001493 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1494 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 PyErr_NoMemory();
1496 return -1;
1497 }
1498 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1499 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001500 _PyUnicode_UTF8(unicode) = NULL;
1501 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001502 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1503 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001504 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505 PyObject_FREE(_PyUnicode_WSTR(unicode));
1506 _PyUnicode_WSTR(unicode) = NULL;
1507 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1508#else
1509 assert(num_surrogates == 0);
1510
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001513 _PyUnicode_UTF8(unicode) = NULL;
1514 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001515 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1516#endif
1517 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1518 }
1519 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001520 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 return 0;
1522}
1523
Alexander Belopolsky40018472011-02-26 01:02:56 +00001524static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001525unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526{
Walter Dörwald16807132007-05-25 13:52:07 +00001527 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 case SSTATE_NOT_INTERNED:
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_MORTAL:
1532 /* revive dead object temporarily for DelItem */
1533 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001534 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 Py_FatalError(
1536 "deletion of interned string failed");
1537 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001538
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 case SSTATE_INTERNED_IMMORTAL:
1540 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001541
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 default:
1543 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001544 }
1545
Victor Stinner03490912011-10-03 23:45:12 +02001546 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001548 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001549 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001550 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1551 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001553 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554}
1555
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached one-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string with a canonical kind can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1572
Alexander Belopolsky40018472011-02-26 01:02:56 +00001573static int
Victor Stinner488fa492011-12-12 00:01:39 +01001574unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575{
Victor Stinner488fa492011-12-12 00:01:39 +01001576 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 if (Py_REFCNT(unicode) != 1)
1578 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001579 if (_PyUnicode_HASH(unicode) != -1)
1580 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_CHECK_INTERNED(unicode))
1582 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001583 if (!PyUnicode_CheckExact(unicode))
1584 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001585#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001586 /* singleton refcount is greater than 1 */
1587 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001588#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 return 1;
1590}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592static int
1593unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1594{
1595 PyObject *unicode;
1596 Py_ssize_t old_length;
1597
1598 assert(p_unicode != NULL);
1599 unicode = *p_unicode;
1600
1601 assert(unicode != NULL);
1602 assert(PyUnicode_Check(unicode));
1603 assert(0 <= length);
1604
Victor Stinner910337b2011-10-03 03:20:16 +02001605 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 old_length = PyUnicode_WSTR_LENGTH(unicode);
1607 else
1608 old_length = PyUnicode_GET_LENGTH(unicode);
1609 if (old_length == length)
1610 return 0;
1611
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001612 if (length == 0) {
1613 Py_DECREF(*p_unicode);
1614 *p_unicode = unicode_empty;
1615 Py_INCREF(*p_unicode);
1616 return 0;
1617 }
1618
Victor Stinner488fa492011-12-12 00:01:39 +01001619 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 PyObject *copy = resize_copy(unicode, length);
1621 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 Py_DECREF(*p_unicode);
1624 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001625 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626 }
1627
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 PyObject *new_unicode = resize_compact(unicode, length);
1630 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001632 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001634 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001635 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636}
1637
Alexander Belopolsky40018472011-02-26 01:02:56 +00001638int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001640{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 PyObject *unicode;
1642 if (p_unicode == NULL) {
1643 PyErr_BadInternalCall();
1644 return -1;
1645 }
1646 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 {
1649 PyErr_BadInternalCall();
1650 return -1;
1651 }
1652 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001653}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001656unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1657 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658{
1659 PyObject *result;
1660 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001661 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001662 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1663 return 0;
1664 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1665 maxchar);
1666 if (result == NULL)
1667 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001668 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001669 Py_DECREF(*p_unicode);
1670 *p_unicode = result;
1671 return 0;
1672}
1673
1674static int
1675unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1676 Py_UCS4 ch)
1677{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001678 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001679 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001680 return -1;
1681 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1682 PyUnicode_DATA(*p_unicode),
1683 (*pos)++, ch);
1684 return 0;
1685}
1686
Victor Stinnerc5166102012-02-22 13:55:02 +01001687/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001688
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001689 WARNING: The function doesn't copy the terminating null character and
1690 doesn't check the maximum character (may write a latin1 character in an
1691 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001692static void
1693unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1694 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001695{
1696 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1697 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001698 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001699
1700 switch (kind) {
1701 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001703#ifdef Py_DEBUG
1704 if (PyUnicode_IS_ASCII(unicode)) {
1705 Py_UCS4 maxchar = ucs1lib_find_max_char(
1706 (const Py_UCS1*)str,
1707 (const Py_UCS1*)str + len);
1708 assert(maxchar < 128);
1709 }
1710#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001711 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 case PyUnicode_2BYTE_KIND: {
1715 Py_UCS2 *start = (Py_UCS2 *)data + index;
1716 Py_UCS2 *ucs2 = start;
1717 assert(index <= PyUnicode_GET_LENGTH(unicode));
1718
Victor Stinner184252a2012-06-16 02:57:41 +02001719 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 *ucs2 = (Py_UCS2)*str;
1721
1722 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001723 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 default: {
1726 Py_UCS4 *start = (Py_UCS4 *)data + index;
1727 Py_UCS4 *ucs4 = start;
1728 assert(kind == PyUnicode_4BYTE_KIND);
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs4 = (Py_UCS4)*str;
1733
1734 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 }
1736 }
1737}
1738
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740static PyObject*
1741get_latin1_char(unsigned char ch)
1742{
Victor Stinnera464fc12011-10-02 20:39:30 +02001743 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001745 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 if (!unicode)
1747 return NULL;
1748 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001749 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 unicode_latin1[ch] = unicode;
1751 }
1752 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Alexander Belopolsky40018472011-02-26 01:02:56 +00001756PyObject *
1757PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001759 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 Py_UCS4 maxchar = 0;
1761 Py_ssize_t num_surrogates;
1762
1763 if (u == NULL)
1764 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001766 /* If the Unicode data is known at construction time, we can apply
1767 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 /* Optimization for empty strings */
1770 if (size == 0 && unicode_empty != NULL) {
1771 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001772 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001773 }
Tim Petersced69f82003-09-16 20:30:58 +00001774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 /* Single character Unicode objects in the Latin-1 range are
1776 shared when using this constructor */
1777 if (size == 1 && *u < 256)
1778 return get_latin1_char((unsigned char)*u);
1779
1780 /* If not empty and not single character, copy the Unicode data
1781 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001782 if (find_maxchar_surrogates(u, u + size,
1783 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return NULL;
1785
Victor Stinner8faf8212011-12-08 22:14:11 +01001786 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 if (!unicode)
1788 return NULL;
1789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 switch (PyUnicode_KIND(unicode)) {
1791 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001792 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1794 break;
1795 case PyUnicode_2BYTE_KIND:
1796#if Py_UNICODE_SIZE == 2
1797 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1798#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001799 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1801#endif
1802 break;
1803 case PyUnicode_4BYTE_KIND:
1804#if SIZEOF_WCHAR_T == 2
1805 /* This is the only case which has to process surrogates, thus
1806 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001807 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808#else
1809 assert(num_surrogates == 0);
1810 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1811#endif
1812 break;
1813 default:
1814 assert(0 && "Impossible state");
1815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001817 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818}
1819
Alexander Belopolsky40018472011-02-26 01:02:56 +00001820PyObject *
1821PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 if (size < 0) {
1824 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001825 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 return NULL;
1827 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001828 if (u != NULL)
1829 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1830 else
1831 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832}
1833
Alexander Belopolsky40018472011-02-26 01:02:56 +00001834PyObject *
1835PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001836{
1837 size_t size = strlen(u);
1838 if (size > PY_SSIZE_T_MAX) {
1839 PyErr_SetString(PyExc_OverflowError, "input too long");
1840 return NULL;
1841 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001842 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001843}
1844
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001845PyObject *
1846_PyUnicode_FromId(_Py_Identifier *id)
1847{
1848 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001849 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1850 strlen(id->string),
1851 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001852 if (!id->object)
1853 return NULL;
1854 PyUnicode_InternInPlace(&id->object);
1855 assert(!id->next);
1856 id->next = static_strings;
1857 static_strings = id;
1858 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 return id->object;
1860}
1861
1862void
1863_PyUnicode_ClearStaticStrings()
1864{
1865 _Py_Identifier *i;
1866 for (i = static_strings; i; i = i->next) {
1867 Py_DECREF(i->object);
1868 i->object = NULL;
1869 i->next = NULL;
1870 }
1871}
1872
Benjamin Peterson0df54292012-03-26 14:50:32 -04001873/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874
Victor Stinnerd3f08822012-05-29 12:57:52 +02001875PyObject*
1876_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001877{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001878 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001879 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001880 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001881#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001882 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001883#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001884 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001885 }
Victor Stinner785938e2011-12-11 20:09:03 +01001886 unicode = PyUnicode_New(size, 127);
1887 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001888 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001889 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1890 assert(_PyUnicode_CheckConsistency(unicode, 1));
1891 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001892}
1893
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001894static Py_UCS4
1895kind_maxchar_limit(unsigned int kind)
1896{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001897 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001898 case PyUnicode_1BYTE_KIND:
1899 return 0x80;
1900 case PyUnicode_2BYTE_KIND:
1901 return 0x100;
1902 case PyUnicode_4BYTE_KIND:
1903 return 0x10000;
1904 default:
1905 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001906 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 }
1908}
1909
Victor Stinnere6abb482012-05-02 01:15:40 +02001910Py_LOCAL_INLINE(Py_UCS4)
1911align_maxchar(Py_UCS4 maxchar)
1912{
1913 if (maxchar <= 127)
1914 return 127;
1915 else if (maxchar <= 255)
1916 return 255;
1917 else if (maxchar <= 65535)
1918 return 65535;
1919 else
1920 return MAX_UNICODE;
1921}
1922
Victor Stinner702c7342011-10-05 13:50:52 +02001923static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001924_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001928
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929 if (size == 0) {
1930 Py_INCREF(unicode_empty);
1931 return unicode_empty;
1932 }
1933 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001934 if (size == 1)
1935 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (!res)
1940 return NULL;
1941 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001942 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001944}
1945
Victor Stinnere57b1c02011-09-28 22:20:48 +02001946static PyObject*
1947_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948{
1949 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001952 if (size == 0) {
1953 Py_INCREF(unicode_empty);
1954 return unicode_empty;
1955 }
1956 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001957 if (size == 1) {
1958 Py_UCS4 ch = u[0];
1959 if (ch < 256)
1960 return get_latin1_char((unsigned char)ch);
1961
1962 res = PyUnicode_New(1, ch);
1963 if (res == NULL)
1964 return NULL;
1965 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1966 assert(_PyUnicode_CheckConsistency(res, 1));
1967 return res;
1968 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001970 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 if (!res)
1973 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001974 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 else {
1977 _PyUnicode_CONVERT_BYTES(
1978 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 if (size == 0) {
1991 Py_INCREF(unicode_empty);
1992 return unicode_empty;
1993 }
1994 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001995 if (size == 1) {
1996 Py_UCS4 ch = u[0];
1997 if (ch < 256)
1998 return get_latin1_char((unsigned char)ch);
1999
2000 res = PyUnicode_New(1, ch);
2001 if (res == NULL)
2002 return NULL;
2003 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
2004 assert(_PyUnicode_CheckConsistency(res, 1));
2005 return res;
2006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002007
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002008 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002009 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 if (!res)
2011 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002012 if (max_char < 256)
2013 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2014 PyUnicode_1BYTE_DATA(res));
2015 else if (max_char < 0x10000)
2016 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2017 PyUnicode_2BYTE_DATA(res));
2018 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002020 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 return res;
2022}
2023
2024PyObject*
2025PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2026{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002027 if (size < 0) {
2028 PyErr_SetString(PyExc_ValueError, "size must be positive");
2029 return NULL;
2030 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002031 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002033 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002035 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002037 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002038 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002039 PyErr_SetString(PyExc_SystemError, "invalid kind");
2040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042}
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044Py_UCS4
2045_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2046{
2047 enum PyUnicode_Kind kind;
2048 void *startptr, *endptr;
2049
2050 assert(PyUnicode_IS_READY(unicode));
2051 assert(0 <= start);
2052 assert(end <= PyUnicode_GET_LENGTH(unicode));
2053 assert(start <= end);
2054
2055 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2056 return PyUnicode_MAX_CHAR_VALUE(unicode);
2057
2058 if (start == end)
2059 return 127;
2060
Victor Stinner94d558b2012-04-27 22:26:58 +02002061 if (PyUnicode_IS_ASCII(unicode))
2062 return 127;
2063
Victor Stinnerece58de2012-04-23 23:36:38 +02002064 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002065 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002066 endptr = (char *)startptr + end * kind;
2067 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002068 switch(kind) {
2069 case PyUnicode_1BYTE_KIND:
2070 return ucs1lib_find_max_char(startptr, endptr);
2071 case PyUnicode_2BYTE_KIND:
2072 return ucs2lib_find_max_char(startptr, endptr);
2073 case PyUnicode_4BYTE_KIND:
2074 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002076 assert(0);
2077 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 }
2079}
2080
Victor Stinner25a4b292011-10-06 12:31:55 +02002081/* Ensure that a string uses the most efficient storage, if it is not the
2082 case: create a new string with of the right kind. Write NULL into *p_unicode
2083 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002084static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002085unicode_adjust_maxchar(PyObject **p_unicode)
2086{
2087 PyObject *unicode, *copy;
2088 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002089 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 unsigned int kind;
2091
2092 assert(p_unicode != NULL);
2093 unicode = *p_unicode;
2094 assert(PyUnicode_IS_READY(unicode));
2095 if (PyUnicode_IS_ASCII(unicode))
2096 return;
2097
2098 len = PyUnicode_GET_LENGTH(unicode);
2099 kind = PyUnicode_KIND(unicode);
2100 if (kind == PyUnicode_1BYTE_KIND) {
2101 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002102 max_char = ucs1lib_find_max_char(u, u + len);
2103 if (max_char >= 128)
2104 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 }
2106 else if (kind == PyUnicode_2BYTE_KIND) {
2107 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002108 max_char = ucs2lib_find_max_char(u, u + len);
2109 if (max_char >= 256)
2110 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 }
2112 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002114 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002115 max_char = ucs4lib_find_max_char(u, u + len);
2116 if (max_char >= 0x10000)
2117 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002118 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002120 if (copy != NULL)
2121 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 Py_DECREF(unicode);
2123 *p_unicode = copy;
2124}
2125
Victor Stinner034f6cf2011-09-30 02:26:44 +02002126PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002127_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002128{
Victor Stinner87af4f22011-11-21 23:03:47 +01002129 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002130 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002131
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132 if (!PyUnicode_Check(unicode)) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002136 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002138
Victor Stinner87af4f22011-11-21 23:03:47 +01002139 length = PyUnicode_GET_LENGTH(unicode);
2140 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 if (!copy)
2142 return NULL;
2143 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2144
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2146 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002147 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149}
2150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152/* Widen Unicode objects to larger buffers. Don't write terminating null
2153 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154
2155void*
2156_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2157{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 Py_ssize_t len;
2159 void *result;
2160 unsigned int skind;
2161
Benjamin Petersonbac79492012-01-14 13:34:47 -05002162 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 return NULL;
2164
2165 len = PyUnicode_GET_LENGTH(s);
2166 skind = PyUnicode_KIND(s);
2167 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002168 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 return NULL;
2170 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002171 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 case PyUnicode_2BYTE_KIND:
2173 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2174 if (!result)
2175 return PyErr_NoMemory();
2176 assert(skind == PyUnicode_1BYTE_KIND);
2177 _PyUnicode_CONVERT_BYTES(
2178 Py_UCS1, Py_UCS2,
2179 PyUnicode_1BYTE_DATA(s),
2180 PyUnicode_1BYTE_DATA(s) + len,
2181 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_4BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 if (skind == PyUnicode_2BYTE_KIND) {
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS2, Py_UCS4,
2190 PyUnicode_2BYTE_DATA(s),
2191 PyUnicode_2BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 else {
2195 assert(skind == PyUnicode_1BYTE_KIND);
2196 _PyUnicode_CONVERT_BYTES(
2197 Py_UCS1, Py_UCS4,
2198 PyUnicode_1BYTE_DATA(s),
2199 PyUnicode_1BYTE_DATA(s) + len,
2200 result);
2201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 default:
2204 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 }
Victor Stinner01698042011-10-04 00:04:26 +02002206 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 return NULL;
2208}
2209
2210static Py_UCS4*
2211as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2212 int copy_null)
2213{
2214 int kind;
2215 void *data;
2216 Py_ssize_t len, targetlen;
2217 if (PyUnicode_READY(string) == -1)
2218 return NULL;
2219 kind = PyUnicode_KIND(string);
2220 data = PyUnicode_DATA(string);
2221 len = PyUnicode_GET_LENGTH(string);
2222 targetlen = len;
2223 if (copy_null)
2224 targetlen++;
2225 if (!target) {
2226 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2227 PyErr_NoMemory();
2228 return NULL;
2229 }
2230 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2231 if (!target) {
2232 PyErr_NoMemory();
2233 return NULL;
2234 }
2235 }
2236 else {
2237 if (targetsize < targetlen) {
2238 PyErr_Format(PyExc_SystemError,
2239 "string is longer than the buffer");
2240 if (copy_null && 0 < targetsize)
2241 target[0] = 0;
2242 return NULL;
2243 }
2244 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002245 if (kind == PyUnicode_1BYTE_KIND) {
2246 Py_UCS1 *start = (Py_UCS1 *) data;
2247 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002249 else if (kind == PyUnicode_2BYTE_KIND) {
2250 Py_UCS2 *start = (Py_UCS2 *) data;
2251 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2252 }
2253 else {
2254 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 if (copy_null)
2258 target[len] = 0;
2259 return target;
2260}
2261
2262Py_UCS4*
2263PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2264 int copy_null)
2265{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002266 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 PyErr_BadInternalCall();
2268 return NULL;
2269 }
2270 return as_ucs4(string, target, targetsize, copy_null);
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4Copy(PyObject *string)
2275{
2276 return as_ucs4(string, NULL, 0, 1);
2277}
2278
2279#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002280
Alexander Belopolsky40018472011-02-26 01:02:56 +00002281PyObject *
2282PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002285 if (size == 0) {
2286 Py_INCREF(unicode_empty);
2287 return unicode_empty;
2288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 PyErr_BadInternalCall();
2290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 }
2292
Martin v. Löwis790465f2008-04-05 20:41:37 +00002293 if (size == -1) {
2294 size = wcslen(w);
2295 }
2296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298}
2299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301
Walter Dörwald346737f2007-05-31 10:44:43 +00002302static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002303makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
Victor Stinnere215d962012-10-06 23:03:36 +02002304 char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 *fmt++ = '%';
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 if (longflag)
2308 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002309 else if (longlongflag) {
2310 /* longlongflag should only ever be nonzero on machines with
2311 HAVE_LONG_LONG defined */
2312#ifdef HAVE_LONG_LONG
2313 char *f = PY_FORMAT_LONG_LONG;
2314 while (*f)
2315 *fmt++ = *f++;
2316#else
2317 /* we shouldn't ever get here */
2318 assert(0);
2319 *fmt++ = 'l';
2320#endif
2321 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002322 else if (size_tflag) {
2323 char *f = PY_FORMAT_SIZE_T;
2324 while (*f)
2325 *fmt++ = *f++;
2326 }
2327 *fmt++ = c;
2328 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002329}
2330
Victor Stinnere215d962012-10-06 23:03:36 +02002331/* maximum number of characters required for output of %ld. 21 characters
2332 allows for 64-bit integers (in decimal) and an optional sign. */
2333#define MAX_LONG_CHARS 21
2334/* maximum number of characters required for output of %lld.
2335 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2336 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2337#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002338
2339static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002340unicode_fromformat_arg(_PyUnicodeWriter *writer,
2341 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002342{
Victor Stinnere215d962012-10-06 23:03:36 +02002343 const char *p;
2344 Py_ssize_t len;
2345 int zeropad;
2346 int width;
2347 int precision;
2348 int longflag;
2349 int longlongflag;
2350 int size_tflag;
2351 int fill;
2352
2353 p = f;
2354 f++;
2355 zeropad = (*f == '0');
Victor Stinner96865452011-03-01 23:44:09 +00002356
2357 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner96865452011-03-01 23:44:09 +00002358 width = 0;
Victor Stinnere215d962012-10-06 23:03:36 +02002359 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002360 if (width > (INT_MAX - ((int)*f - '0')) / 10) {
2361 PyErr_SetString(PyExc_ValueError,
2362 "width too big");
2363 return NULL;
2364 }
Victor Stinnere215d962012-10-06 23:03:36 +02002365 width = (width*10) + (*f - '0');
2366 f++;
2367 }
Victor Stinner96865452011-03-01 23:44:09 +00002368 precision = 0;
2369 if (*f == '.') {
2370 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002371 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner3921e902012-10-06 23:05:00 +02002372 if (precision > (INT_MAX - ((int)*f - '0')) / 10) {
2373 PyErr_SetString(PyExc_ValueError,
2374 "precision too big");
2375 return NULL;
2376 }
Victor Stinnere215d962012-10-06 23:03:36 +02002377 precision = (precision*10) + (*f - '0');
2378 f++;
2379 }
Victor Stinner96865452011-03-01 23:44:09 +00002380 if (*f == '%') {
2381 /* "%.3%s" => f points to "3" */
2382 f--;
2383 }
2384 }
2385 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002386 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002387 f--;
2388 }
Victor Stinner96865452011-03-01 23:44:09 +00002389
2390 /* Handle %ld, %lu, %lld and %llu. */
2391 longflag = 0;
2392 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002393 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002394 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002395 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002396 longflag = 1;
2397 ++f;
2398 }
2399#ifdef HAVE_LONG_LONG
2400 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002401 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002402 longlongflag = 1;
2403 f += 2;
2404 }
2405#endif
2406 }
2407 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002408 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002409 size_tflag = 1;
2410 ++f;
2411 }
Victor Stinnere215d962012-10-06 23:03:36 +02002412
2413 if (f[1] == '\0')
2414 writer->overallocate = 0;
2415
2416 switch (*f) {
2417 case 'c':
2418 {
2419 int ordinal = va_arg(*vargs, int);
2420 if (_PyUnicodeWriter_Prepare(writer, 1, ordinal) == -1)
2421 return NULL;
2422 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ordinal);
2423 writer->pos++;
2424 break;
2425 }
2426
2427 case 'i':
2428 case 'd':
2429 case 'u':
2430 case 'x':
2431 {
2432 /* used by sprintf */
2433 char fmt[10]; /* should be enough for "%0lld\0" */
2434 char small_buffer[MAX_LONG_CHARS];
2435 char *buffer;
2436 int err;
2437
2438 if (sizeof(small_buffer) - 1 < precision) {
2439 buffer = PyMem_Malloc(precision + 1);
2440 if (buffer == NULL) {
2441 PyErr_NoMemory();
2442 return NULL;
2443 }
2444 }
2445 else
2446 buffer = small_buffer;
2447
2448 if (*f == 'u') {
2449 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2450
2451 if (longflag)
2452 len = sprintf(buffer, fmt,
2453 va_arg(*vargs, unsigned long));
2454#ifdef HAVE_LONG_LONG
2455 else if (longlongflag)
2456 len = sprintf(buffer, fmt,
2457 va_arg(*vargs, unsigned PY_LONG_LONG));
2458#endif
2459 else if (size_tflag)
2460 len = sprintf(buffer, fmt,
2461 va_arg(*vargs, size_t));
2462 else
2463 len = sprintf(buffer, fmt,
2464 va_arg(*vargs, unsigned int));
2465 }
2466 else if (*f == 'x') {
2467 makefmt(fmt, 0, 0, 0, 'x');
2468 len = sprintf(buffer, fmt, va_arg(*vargs, int));
2469 }
2470 else {
2471 makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2472
2473 if (longflag)
2474 len = sprintf(buffer, fmt,
2475 va_arg(*vargs, long));
2476#ifdef HAVE_LONG_LONG
2477 else if (longlongflag)
2478 len = sprintf(buffer, fmt,
2479 va_arg(*vargs, PY_LONG_LONG));
2480#endif
2481 else if (size_tflag)
2482 len = sprintf(buffer, fmt,
2483 va_arg(*vargs, Py_ssize_t));
2484 else
2485 len = sprintf(buffer, fmt,
2486 va_arg(*vargs, int));
2487 }
2488 assert(len >= 0);
2489
2490 err = 0;
2491 if (precision < len)
2492 precision = len;
2493 if (width > precision) {
2494 Py_UCS4 fillchar;
2495 fill = width - precision;
2496 fillchar = zeropad?'0':' ';
2497 if (_PyUnicodeWriter_Prepare(writer, fill, fillchar) != -1) {
2498 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2499 err = 1;
2500 }
2501 else
2502 err = 1;
2503 if (!err)
2504 writer->pos += fill;
2505 }
2506 if (!err && precision > len) {
2507 fill = precision - len;
2508 if (_PyUnicodeWriter_Prepare(writer, fill, '0') != -1) {
2509 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2510 err = 1;
2511 }
2512 else
2513 err = 1;
2514 if (!err)
2515 writer->pos += fill;
2516 }
2517 if (!err) {
2518 if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1)
2519 err = 1;
2520 }
2521
2522 if (buffer != small_buffer) {
2523 PyMem_Free(buffer);
2524 buffer = small_buffer;
2525 }
2526 if (err)
2527 return NULL;
2528
2529 break;
2530 }
2531
2532 case 'p':
2533 {
2534 char number[MAX_LONG_LONG_CHARS];
2535
2536 len = sprintf(number, "%p", va_arg(*vargs, void*));
2537 assert(len >= 0);
2538
2539 /* %p is ill-defined: ensure leading 0x. */
2540 if (number[1] == 'X')
2541 number[1] = 'x';
2542 else if (number[1] != 'x') {
2543 memmove(number + 2, number,
2544 strlen(number) + 1);
2545 number[0] = '0';
2546 number[1] = 'x';
2547 len += 2;
2548 }
2549
2550 if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1)
2551 return NULL;
2552 break;
2553 }
2554
2555 case 's':
2556 {
2557 /* UTF-8 */
2558 const char *s = va_arg(*vargs, const char*);
2559 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
2560 if (!str)
2561 return NULL;
2562 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2563 Py_DECREF(str);
2564 return NULL;
2565 }
2566 Py_DECREF(str);
2567 break;
2568 }
2569
2570 case 'U':
2571 {
2572 PyObject *obj = va_arg(*vargs, PyObject *);
2573 assert(obj && _PyUnicode_CHECK(obj));
2574
2575 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2576 return NULL;
2577 break;
2578 }
2579
2580 case 'V':
2581 {
2582 PyObject *obj = va_arg(*vargs, PyObject *);
2583 const char *str = va_arg(*vargs, const char *);
2584 PyObject *str_obj;
2585 assert(obj || str);
2586 if (obj) {
2587 assert(_PyUnicode_CHECK(obj));
2588 if (_PyUnicodeWriter_WriteStr(writer, obj) == -1)
2589 return NULL;
2590 }
2591 else {
2592 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
2593 if (!str_obj)
2594 return NULL;
2595 if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) {
2596 Py_DECREF(str_obj);
2597 return NULL;
2598 }
2599 Py_DECREF(str_obj);
2600 }
2601 break;
2602 }
2603
2604 case 'S':
2605 {
2606 PyObject *obj = va_arg(*vargs, PyObject *);
2607 PyObject *str;
2608 assert(obj);
2609 str = PyObject_Str(obj);
2610 if (!str)
2611 return NULL;
2612 if (_PyUnicodeWriter_WriteStr(writer, str) == -1) {
2613 Py_DECREF(str);
2614 return NULL;
2615 }
2616 Py_DECREF(str);
2617 break;
2618 }
2619
2620 case 'R':
2621 {
2622 PyObject *obj = va_arg(*vargs, PyObject *);
2623 PyObject *repr;
2624 assert(obj);
2625 repr = PyObject_Repr(obj);
2626 if (!repr)
2627 return NULL;
2628 if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) {
2629 Py_DECREF(repr);
2630 return NULL;
2631 }
2632 Py_DECREF(repr);
2633 break;
2634 }
2635
2636 case 'A':
2637 {
2638 PyObject *obj = va_arg(*vargs, PyObject *);
2639 PyObject *ascii;
2640 assert(obj);
2641 ascii = PyObject_ASCII(obj);
2642 if (!ascii)
2643 return NULL;
2644 if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) {
2645 Py_DECREF(ascii);
2646 return NULL;
2647 }
2648 Py_DECREF(ascii);
2649 break;
2650 }
2651
2652 case '%':
2653 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == 1)
2654 return NULL;
2655 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
2656 writer->pos++;
2657 break;
2658
2659 default:
2660 /* if we stumble upon an unknown formatting code, copy the rest
2661 of the format string to the output string. (we cannot just
2662 skip the code, since there's no way to know what's in the
2663 argument list) */
2664 len = strlen(p);
2665 if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2666 return NULL;
2667 f = p+len;
2668 return f;
2669 }
2670
2671 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002672 return f;
2673}
2674
Walter Dörwaldd2034312007-05-18 16:29:38 +00002675PyObject *
2676PyUnicode_FromFormatV(const char *format, va_list vargs)
2677{
Victor Stinnere215d962012-10-06 23:03:36 +02002678 va_list vargs2;
2679 const char *f;
2680 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002681
Victor Stinnere215d962012-10-06 23:03:36 +02002682 _PyUnicodeWriter_Init(&writer, strlen(format) + 100);
2683
2684 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2685 Copy it to be able to pass a reference to a subfunction. */
2686 Py_VA_COPY(vargs2, vargs);
2687
2688 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002689 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002690 f = unicode_fromformat_arg(&writer, f, &vargs2);
2691 if (f == NULL)
2692 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002694 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002695 const char *p;
2696 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002697
Victor Stinnere215d962012-10-06 23:03:36 +02002698 p = f;
2699 do
2700 {
2701 if ((unsigned char)*p > 127) {
2702 PyErr_Format(PyExc_ValueError,
2703 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2704 "string, got a non-ASCII byte: 0x%02x",
2705 (unsigned char)*p);
2706 return NULL;
2707 }
2708 p++;
2709 }
2710 while (*p != '\0' && *p != '%');
2711 len = p - f;
2712
2713 if (*p == '\0')
2714 writer.overallocate = 0;
2715 if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2716 goto fail;
2717 unicode_write_cstr(writer.buffer, writer.pos, f, len);
2718 writer.pos += len;
2719
2720 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 }
Victor Stinnere215d962012-10-06 23:03:36 +02002723 return _PyUnicodeWriter_Finish(&writer);
2724
2725 fail:
2726 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002728}
2729
Walter Dörwaldd2034312007-05-18 16:29:38 +00002730PyObject *
2731PyUnicode_FromFormat(const char *format, ...)
2732{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 PyObject* ret;
2734 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002735
2736#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002738#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 ret = PyUnicode_FromFormatV(format, vargs);
2742 va_end(vargs);
2743 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002744}
2745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746#ifdef HAVE_WCHAR_H
2747
Victor Stinner5593d8a2010-10-02 11:11:27 +00002748/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2749 convert a Unicode object to a wide character string.
2750
Victor Stinnerd88d9832011-09-06 02:00:05 +02002751 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002752 character) required to convert the unicode object. Ignore size argument.
2753
Victor Stinnerd88d9832011-09-06 02:00:05 +02002754 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002755 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002756 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002757static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002758unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002759 wchar_t *w,
2760 Py_ssize_t size)
2761{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002762 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002763 const wchar_t *wstr;
2764
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002765 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 if (wstr == NULL)
2767 return -1;
2768
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002770 if (size > res)
2771 size = res + 1;
2772 else
2773 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002775 return res;
2776 }
2777 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002779}
2780
2781Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002782PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002783 wchar_t *w,
2784 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785{
2786 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002787 PyErr_BadInternalCall();
2788 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002790 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791}
2792
Victor Stinner137c34c2010-09-29 10:25:54 +00002793wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002794PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 Py_ssize_t *size)
2796{
2797 wchar_t* buffer;
2798 Py_ssize_t buflen;
2799
2800 if (unicode == NULL) {
2801 PyErr_BadInternalCall();
2802 return NULL;
2803 }
2804
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002805 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 if (buflen == -1)
2807 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002808 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002809 PyErr_NoMemory();
2810 return NULL;
2811 }
2812
Victor Stinner137c34c2010-09-29 10:25:54 +00002813 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2814 if (buffer == NULL) {
2815 PyErr_NoMemory();
2816 return NULL;
2817 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002818 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002819 if (buflen == -1) {
2820 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002822 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002823 if (size != NULL)
2824 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002825 return buffer;
2826}
2827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829
Alexander Belopolsky40018472011-02-26 01:02:56 +00002830PyObject *
2831PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002834 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 PyErr_SetString(PyExc_ValueError,
2836 "chr() arg not in range(0x110000)");
2837 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002838 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840 if (ordinal < 256)
2841 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002843 v = PyUnicode_New(1, ordinal);
2844 if (v == NULL)
2845 return NULL;
2846 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002847 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002848 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002849}
2850
Alexander Belopolsky40018472011-02-26 01:02:56 +00002851PyObject *
2852PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002853{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002854 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002856 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002857 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002858 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 Py_INCREF(obj);
2860 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002861 }
2862 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002863 /* For a Unicode subtype that's not a Unicode object,
2864 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002865 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002866 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002867 PyErr_Format(PyExc_TypeError,
2868 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002869 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002870 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002871}
2872
Alexander Belopolsky40018472011-02-26 01:02:56 +00002873PyObject *
2874PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002875 const char *encoding,
2876 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002877{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002878 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002879 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002880
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002882 PyErr_BadInternalCall();
2883 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002885
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002886 /* Decoding bytes objects is the most common case and should be fast */
2887 if (PyBytes_Check(obj)) {
2888 if (PyBytes_GET_SIZE(obj) == 0) {
2889 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002890 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002891 }
2892 else {
2893 v = PyUnicode_Decode(
2894 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2895 encoding, errors);
2896 }
2897 return v;
2898 }
2899
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002900 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002901 PyErr_SetString(PyExc_TypeError,
2902 "decoding str is not supported");
2903 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002904 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002905
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002906 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2907 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2908 PyErr_Format(PyExc_TypeError,
2909 "coercing to str: need bytes, bytearray "
2910 "or buffer-like object, %.80s found",
2911 Py_TYPE(obj)->tp_name);
2912 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002913 }
Tim Petersced69f82003-09-16 20:30:58 +00002914
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002915 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002916 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002917 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 }
Tim Petersced69f82003-09-16 20:30:58 +00002919 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002920 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002921
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002922 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002923 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924}
2925
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result is stored in `lower`, a buffer of `lower_len` bytes, and is
   NUL-terminated on success.  On failure the content of `lower` is
   unspecified and must not be used.

   Return 0 on error (the normalized name plus its terminating NUL does not
   fit in lower_len bytes), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator; bail out before
       computing lower_len - 1, which would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name is subject to the same size limit as any other
           encoding name. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2962
Alexander Belopolsky40018472011-02-26 01:02:56 +00002963PyObject *
2964PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002965 Py_ssize_t size,
2966 const char *encoding,
2967 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002968{
2969 PyObject *buffer = NULL, *unicode;
2970 Py_buffer info;
2971 char lower[11]; /* Enough for any encoding shortcut */
2972
Fred Drakee4315f52000-05-09 19:53:39 +00002973 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002974 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002975 if ((strcmp(lower, "utf-8") == 0) ||
2976 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002977 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002978 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002979 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002980 (strcmp(lower, "iso-8859-1") == 0))
2981 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002982#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002983 else if (strcmp(lower, "mbcs") == 0)
2984 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002985#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002986 else if (strcmp(lower, "ascii") == 0)
2987 return PyUnicode_DecodeASCII(s, size, errors);
2988 else if (strcmp(lower, "utf-16") == 0)
2989 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2990 else if (strcmp(lower, "utf-32") == 0)
2991 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993
2994 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002995 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002996 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002997 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002998 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 if (buffer == NULL)
3000 goto onError;
3001 unicode = PyCodec_Decode(buffer, encoding, errors);
3002 if (unicode == NULL)
3003 goto onError;
3004 if (!PyUnicode_Check(unicode)) {
3005 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003006 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003007 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008 Py_DECREF(unicode);
3009 goto onError;
3010 }
3011 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003012 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003013
Benjamin Peterson29060642009-01-31 22:14:21 +00003014 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003015 Py_XDECREF(buffer);
3016 return NULL;
3017}
3018
Alexander Belopolsky40018472011-02-26 01:02:56 +00003019PyObject *
3020PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003021 const char *encoding,
3022 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003023{
3024 PyObject *v;
3025
3026 if (!PyUnicode_Check(unicode)) {
3027 PyErr_BadArgument();
3028 goto onError;
3029 }
3030
3031 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003032 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003033
3034 /* Decode via the codec registry */
3035 v = PyCodec_Decode(unicode, encoding, errors);
3036 if (v == NULL)
3037 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003038 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003039
Benjamin Peterson29060642009-01-31 22:14:21 +00003040 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003041 return NULL;
3042}
3043
Alexander Belopolsky40018472011-02-26 01:02:56 +00003044PyObject *
3045PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003046 const char *encoding,
3047 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003048{
3049 PyObject *v;
3050
3051 if (!PyUnicode_Check(unicode)) {
3052 PyErr_BadArgument();
3053 goto onError;
3054 }
3055
3056 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003057 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003058
3059 /* Decode via the codec registry */
3060 v = PyCodec_Decode(unicode, encoding, errors);
3061 if (v == NULL)
3062 goto onError;
3063 if (!PyUnicode_Check(v)) {
3064 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003065 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003066 Py_TYPE(v)->tp_name);
3067 Py_DECREF(v);
3068 goto onError;
3069 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003070 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003071
Benjamin Peterson29060642009-01-31 22:14:21 +00003072 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003073 return NULL;
3074}
3075
Alexander Belopolsky40018472011-02-26 01:02:56 +00003076PyObject *
3077PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003078 Py_ssize_t size,
3079 const char *encoding,
3080 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081{
3082 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003083
Guido van Rossumd57fd912000-03-10 22:53:23 +00003084 unicode = PyUnicode_FromUnicode(s, size);
3085 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3088 Py_DECREF(unicode);
3089 return v;
3090}
3091
Alexander Belopolsky40018472011-02-26 01:02:56 +00003092PyObject *
3093PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003094 const char *encoding,
3095 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003096{
3097 PyObject *v;
3098
3099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
3101 goto onError;
3102 }
3103
3104 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003105 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003106
3107 /* Encode via the codec registry */
3108 v = PyCodec_Encode(unicode, encoding, errors);
3109 if (v == NULL)
3110 goto onError;
3111 return v;
3112
Benjamin Peterson29060642009-01-31 22:14:21 +00003113 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003114 return NULL;
3115}
3116
/* Locate the first wide character of the NUL-terminated string `wstr` that
   wcstombs() rejects, by converting the string one character at a time
   (one surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index, in wchar_t units, of the offending character.  0 is
   also returned when every character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) followed by a terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the pair together so it is converted as one character */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3163
Victor Stinner1b579672011-12-17 05:47:23 +01003164static int
3165locale_error_handler(const char *errors, int *surrogateescape)
3166{
3167 if (errors == NULL) {
3168 *surrogateescape = 0;
3169 return 0;
3170 }
3171
3172 if (strcmp(errors, "strict") == 0) {
3173 *surrogateescape = 0;
3174 return 0;
3175 }
3176 if (strcmp(errors, "surrogateescape") == 0) {
3177 *surrogateescape = 1;
3178 return 0;
3179 }
3180 PyErr_Format(PyExc_ValueError,
3181 "only 'strict' and 'surrogateescape' error handlers "
3182 "are supported, not '%s'",
3183 errors);
3184 return -1;
3185}
3186
/* Encode the str object `unicode` to a bytes object using the C library's
   wide-character to multibyte conversion.

   `errors` must be NULL, "strict" or "surrogateescape" (anything else is
   rejected by locale_error_handler()).  With "surrogateescape" the
   conversion is done by _Py_wchar2char(), otherwise by wcstombs().

   A string containing an embedded null character raises TypeError.  A
   character that cannot be converted is reported through a
   UnicodeEncodeError built for the "locale" codec.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null character, so a mismatch with the
       reported length means the string contains one before its end. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* locale encoding with surrogateescape */
        char *str;

        str = _Py_wchar2char(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is handled as a memory error; any
               other value is used as the index of the failing character. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        size_t len, len2;

        /* First pass: compute the number of bytes required. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown yet: encode_error will search for it. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second pass: convert into the bytes object; len+1 leaves room
           for the terminating null byte written by wcstombs(). */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Read errno first, before any other call gets a chance to change it. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    /* wstr still holds the input here; bytes may or may not be allocated. */
    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Build the exception reason from the system error message, falling
       back to a fixed message when the message cannot be converted. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = _Py_char2wchar(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_Free(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError("locale", unicode, start, end, reason) covering
       the single position error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* Hand the exception to the strict error handler; its return value
           is not used and NULL is returned below either way. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3295
Victor Stinnerad158722010-10-27 00:25:46 +00003296PyObject *
3297PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003298{
Victor Stinner99b95382011-07-04 14:23:54 +02003299#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003300 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003301#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003303#else
Victor Stinner793b5312011-04-27 00:24:21 +02003304 PyInterpreterState *interp = PyThreadState_GET()->interp;
3305 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3306 cannot use it to encode and decode filenames before it is loaded. Load
3307 the Python codec requires to encode at least its own filename. Use the C
3308 version of the locale codec until the codec registry is initialized and
3309 the Python codec is loaded.
3310
3311 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3312 cannot only rely on it: check also interp->fscodec_initialized for
3313 subinterpreters. */
3314 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003315 return PyUnicode_AsEncodedString(unicode,
3316 Py_FileSystemDefaultEncoding,
3317 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003318 }
3319 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003320 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003321 }
Victor Stinnerad158722010-10-27 00:25:46 +00003322#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003323}
3324
Alexander Belopolsky40018472011-02-26 01:02:56 +00003325PyObject *
3326PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003327 const char *encoding,
3328 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003329{
3330 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003331 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003332
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 if (!PyUnicode_Check(unicode)) {
3334 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 }
Fred Drakee4315f52000-05-09 19:53:39 +00003337
Fred Drakee4315f52000-05-09 19:53:39 +00003338 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003339 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003340 if ((strcmp(lower, "utf-8") == 0) ||
3341 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003342 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003343 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003345 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003347 }
Victor Stinner37296e82010-06-10 13:36:23 +00003348 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003349 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003350 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003352#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003353 else if (strcmp(lower, "mbcs") == 0)
3354 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003355#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003356 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360 /* Encode via the codec registry */
3361 v = PyCodec_Encode(unicode, encoding, errors);
3362 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003363 return NULL;
3364
3365 /* The normal path */
3366 if (PyBytes_Check(v))
3367 return v;
3368
3369 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003371 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003372 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003373
3374 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3375 "encoder %s returned bytearray instead of bytes",
3376 encoding);
3377 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003378 Py_DECREF(v);
3379 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003380 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003382 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3383 Py_DECREF(v);
3384 return b;
3385 }
3386
3387 PyErr_Format(PyExc_TypeError,
3388 "encoder did not return a bytes object (type=%.400s)",
3389 Py_TYPE(v)->tp_name);
3390 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003391 return NULL;
3392}
3393
Alexander Belopolsky40018472011-02-26 01:02:56 +00003394PyObject *
3395PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003396 const char *encoding,
3397 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003398{
3399 PyObject *v;
3400
3401 if (!PyUnicode_Check(unicode)) {
3402 PyErr_BadArgument();
3403 goto onError;
3404 }
3405
3406 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003407 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003408
3409 /* Encode via the codec registry */
3410 v = PyCodec_Encode(unicode, encoding, errors);
3411 if (v == NULL)
3412 goto onError;
3413 if (!PyUnicode_Check(v)) {
3414 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003415 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416 Py_TYPE(v)->tp_name);
3417 Py_DECREF(v);
3418 goto onError;
3419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003421
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003423 return NULL;
3424}
3425
/* Locate the first byte sequence in the `len` bytes at `str` that
   mbrtowc() cannot convert (invalid sequence or incomplete character).

   Return the byte offset of that sequence.  0 is returned when no such
   sequence is found, and always when HAVE_MBRTOWC is not defined, since
   the string cannot be scanned in that case. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    /* Start from the initial conversion state */
    memset(&mbs, 0, sizeof mbs);
    while (len)
    {
        converted = mbrtowc(&ch, (char*)str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        else {
            str += converted;
            len -= converted;
        }
    }
#endif
    /* Single exit shared by both configurations: either the undecodable
       byte sequence was not found, or mbrtowc() is not available. */
    return 0;
}
3456
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003457PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003458PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003459 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003460{
3461 wchar_t smallbuf[256];
3462 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3463 wchar_t *wstr;
3464 size_t wlen, wlen2;
3465 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003466 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003467 size_t error_pos;
3468 char *errmsg;
3469 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003470
3471 if (locale_error_handler(errors, &surrogateescape) < 0)
3472 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003473
3474 if (str[len] != '\0' || len != strlen(str)) {
3475 PyErr_SetString(PyExc_TypeError, "embedded null character");
3476 return NULL;
3477 }
3478
3479 if (surrogateescape)
3480 {
3481 wstr = _Py_char2wchar(str, &wlen);
3482 if (wstr == NULL) {
3483 if (wlen == (size_t)-1)
3484 PyErr_NoMemory();
3485 else
3486 PyErr_SetFromErrno(PyExc_OSError);
3487 return NULL;
3488 }
3489
3490 unicode = PyUnicode_FromWideChar(wstr, wlen);
3491 PyMem_Free(wstr);
3492 }
3493 else {
3494#ifndef HAVE_BROKEN_MBSTOWCS
3495 wlen = mbstowcs(NULL, str, 0);
3496#else
3497 wlen = len;
3498#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003499 if (wlen == (size_t)-1)
3500 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501 if (wlen+1 <= smallbuf_len) {
3502 wstr = smallbuf;
3503 }
3504 else {
3505 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3506 return PyErr_NoMemory();
3507
3508 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3509 if (!wstr)
3510 return PyErr_NoMemory();
3511 }
3512
3513 /* This shouldn't fail now */
3514 wlen2 = mbstowcs(wstr, str, wlen+1);
3515 if (wlen2 == (size_t)-1) {
3516 if (wstr != smallbuf)
3517 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003518 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003519 }
3520#ifdef HAVE_BROKEN_MBSTOWCS
3521 assert(wlen2 == wlen);
3522#endif
3523 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3524 if (wstr != smallbuf)
3525 PyMem_Free(wstr);
3526 }
3527 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003528
3529decode_error:
3530 errmsg = strerror(errno);
3531 assert(errmsg != NULL);
3532
3533 error_pos = mbstowcs_errorpos(str, len);
3534 if (errmsg != NULL) {
3535 size_t errlen;
3536 wstr = _Py_char2wchar(errmsg, &errlen);
3537 if (wstr != NULL) {
3538 reason = PyUnicode_FromWideChar(wstr, errlen);
3539 PyMem_Free(wstr);
3540 } else
3541 errmsg = NULL;
3542 }
3543 if (errmsg == NULL)
3544 reason = PyUnicode_FromString(
3545 "mbstowcs() encountered an invalid multibyte sequence");
3546 if (reason == NULL)
3547 return NULL;
3548
3549 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3550 "locale", str, len,
3551 (Py_ssize_t)error_pos,
3552 (Py_ssize_t)(error_pos+1),
3553 reason);
3554 Py_DECREF(reason);
3555 if (exc != NULL) {
3556 PyCodec_StrictErrors(exc);
3557 Py_XDECREF(exc);
3558 }
3559 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003560}
3561
3562PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003563PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003564{
3565 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003566 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003567}
3568
3569
3570PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003571PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003572 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003573 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3574}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575
Christian Heimes5894ba72007-11-04 11:43:14 +00003576PyObject*
3577PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3578{
Victor Stinner99b95382011-07-04 14:23:54 +02003579#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003580 return PyUnicode_DecodeMBCS(s, size, NULL);
3581#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003582 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003583#else
Victor Stinner793b5312011-04-27 00:24:21 +02003584 PyInterpreterState *interp = PyThreadState_GET()->interp;
3585 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3586 cannot use it to encode and decode filenames before it is loaded. Load
3587 the Python codec requires to encode at least its own filename. Use the C
3588 version of the locale codec until the codec registry is initialized and
3589 the Python codec is loaded.
3590
3591 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3592 cannot only rely on it: check also interp->fscodec_initialized for
3593 subinterpreters. */
3594 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003595 return PyUnicode_Decode(s, size,
3596 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003597 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003598 }
3599 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003600 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601 }
Victor Stinnerad158722010-10-27 00:25:46 +00003602#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003603}
3604
Martin v. Löwis011e8422009-05-05 04:43:17 +00003605
3606int
Antoine Pitrou13348842012-01-29 18:36:34 +01003607_PyUnicode_HasNULChars(PyObject* s)
3608{
3609 static PyObject *nul = NULL;
3610
3611 if (nul == NULL)
3612 nul = PyUnicode_FromStringAndSize("\0", 1);
3613 if (nul == NULL)
3614 return -1;
3615 return PyUnicode_Contains(s, nul);
3616}
3617
3618
3619int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620PyUnicode_FSConverter(PyObject* arg, void* addr)
3621{
3622 PyObject *output = NULL;
3623 Py_ssize_t size;
3624 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003625 if (arg == NULL) {
3626 Py_DECREF(*(PyObject**)addr);
3627 return 1;
3628 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003629 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003630 output = arg;
3631 Py_INCREF(output);
3632 }
3633 else {
3634 arg = PyUnicode_FromObject(arg);
3635 if (!arg)
3636 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003637 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003638 Py_DECREF(arg);
3639 if (!output)
3640 return 0;
3641 if (!PyBytes_Check(output)) {
3642 Py_DECREF(output);
3643 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3644 return 0;
3645 }
3646 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003647 size = PyBytes_GET_SIZE(output);
3648 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003649 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003650 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003651 Py_DECREF(output);
3652 return 0;
3653 }
3654 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003655 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003656}
3657
3658
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003659int
3660PyUnicode_FSDecoder(PyObject* arg, void* addr)
3661{
3662 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003663 if (arg == NULL) {
3664 Py_DECREF(*(PyObject**)addr);
3665 return 1;
3666 }
3667 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003668 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003669 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003670 output = arg;
3671 Py_INCREF(output);
3672 }
3673 else {
3674 arg = PyBytes_FromObject(arg);
3675 if (!arg)
3676 return 0;
3677 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3678 PyBytes_GET_SIZE(arg));
3679 Py_DECREF(arg);
3680 if (!output)
3681 return 0;
3682 if (!PyUnicode_Check(output)) {
3683 Py_DECREF(output);
3684 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3685 return 0;
3686 }
3687 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003688 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003689 Py_DECREF(output);
3690 return 0;
3691 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003692 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003693 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003694 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3695 Py_DECREF(output);
3696 return 0;
3697 }
3698 *(PyObject**)addr = output;
3699 return Py_CLEANUP_SUPPORTED;
3700}
3701
3702
Martin v. Löwis5b222132007-06-10 09:51:05 +00003703char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003705{
Christian Heimesf3863112007-11-22 07:46:41 +00003706 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003708 if (!PyUnicode_Check(unicode)) {
3709 PyErr_BadArgument();
3710 return NULL;
3711 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003712 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003714
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003715 if (PyUnicode_UTF8(unicode) == NULL) {
3716 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3718 if (bytes == NULL)
3719 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003720 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3721 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003722 Py_DECREF(bytes);
3723 return NULL;
3724 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003725 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3726 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3727 PyBytes_AS_STRING(bytes),
3728 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729 Py_DECREF(bytes);
3730 }
3731
3732 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003733 *psize = PyUnicode_UTF8_LENGTH(unicode);
3734 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003735}
3736
3737char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003740 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3741}
3742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003743Py_UNICODE *
3744PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3745{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003746 const unsigned char *one_byte;
3747#if SIZEOF_WCHAR_T == 4
3748 const Py_UCS2 *two_bytes;
3749#else
3750 const Py_UCS4 *four_bytes;
3751 const Py_UCS4 *ucs4_end;
3752 Py_ssize_t num_surrogates;
3753#endif
3754 wchar_t *w;
3755 wchar_t *wchar_end;
3756
3757 if (!PyUnicode_Check(unicode)) {
3758 PyErr_BadArgument();
3759 return NULL;
3760 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003761 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 assert(_PyUnicode_KIND(unicode) != 0);
3764 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003766 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003767#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3769 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003770 num_surrogates = 0;
3771
3772 for (; four_bytes < ucs4_end; ++four_bytes) {
3773 if (*four_bytes > 0xFFFF)
3774 ++num_surrogates;
3775 }
3776
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003777 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3778 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3779 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780 PyErr_NoMemory();
3781 return NULL;
3782 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003783 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003784
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003785 w = _PyUnicode_WSTR(unicode);
3786 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3787 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3789 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003790 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003791 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003792 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3793 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003794 }
3795 else
3796 *w = *four_bytes;
3797
3798 if (w > wchar_end) {
3799 assert(0 && "Miscalculated string end");
3800 }
3801 }
3802 *w = 0;
3803#else
3804 /* sizeof(wchar_t) == 4 */
3805 Py_FatalError("Impossible unicode object state, wstr and str "
3806 "should share memory already.");
3807 return NULL;
3808#endif
3809 }
3810 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3812 (_PyUnicode_LENGTH(unicode) + 1));
3813 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 PyErr_NoMemory();
3815 return NULL;
3816 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3818 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3819 w = _PyUnicode_WSTR(unicode);
3820 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003822 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3823 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 for (; w < wchar_end; ++one_byte, ++w)
3825 *w = *one_byte;
3826 /* null-terminate the wstr */
3827 *w = 0;
3828 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003829 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 for (; w < wchar_end; ++two_bytes, ++w)
3833 *w = *two_bytes;
3834 /* null-terminate the wstr */
3835 *w = 0;
3836#else
3837 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003838 PyObject_FREE(_PyUnicode_WSTR(unicode));
3839 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 Py_FatalError("Impossible unicode object state, wstr "
3841 "and str should share memory already.");
3842 return NULL;
3843#endif
3844 }
3845 else {
3846 assert(0 && "This should never happen.");
3847 }
3848 }
3849 }
3850 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 *size = PyUnicode_WSTR_LENGTH(unicode);
3852 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003853}
3854
Alexander Belopolsky40018472011-02-26 01:02:56 +00003855Py_UNICODE *
3856PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859}
3860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861
Alexander Belopolsky40018472011-02-26 01:02:56 +00003862Py_ssize_t
3863PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864{
3865 if (!PyUnicode_Check(unicode)) {
3866 PyErr_BadArgument();
3867 goto onError;
3868 }
3869 return PyUnicode_GET_SIZE(unicode);
3870
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 return -1;
3873}
3874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875Py_ssize_t
3876PyUnicode_GetLength(PyObject *unicode)
3877{
Victor Stinner07621332012-06-16 04:53:46 +02003878 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 PyErr_BadArgument();
3880 return -1;
3881 }
Victor Stinner07621332012-06-16 04:53:46 +02003882 if (PyUnicode_READY(unicode) == -1)
3883 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 return PyUnicode_GET_LENGTH(unicode);
3885}
3886
3887Py_UCS4
3888PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3889{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003890 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3891 PyErr_BadArgument();
3892 return (Py_UCS4)-1;
3893 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003894 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003895 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003896 return (Py_UCS4)-1;
3897 }
3898 return PyUnicode_READ_CHAR(unicode, index);
3899}
3900
3901int
3902PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3903{
3904 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003905 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906 return -1;
3907 }
Victor Stinner488fa492011-12-12 00:01:39 +01003908 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003909 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003910 PyErr_SetString(PyExc_IndexError, "string index out of range");
3911 return -1;
3912 }
Victor Stinner488fa492011-12-12 00:01:39 +01003913 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003914 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003915 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3916 PyErr_SetString(PyExc_ValueError, "character out of range");
3917 return -1;
3918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3920 index, ch);
3921 return 0;
3922}
3923
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3929
Victor Stinner554f3f02010-06-16 23:33:54 +00003930/* create or adjust a UnicodeDecodeError */
3931static void
3932make_decode_exception(PyObject **exceptionObject,
3933 const char *encoding,
3934 const char *input, Py_ssize_t length,
3935 Py_ssize_t startpos, Py_ssize_t endpos,
3936 const char *reason)
3937{
3938 if (*exceptionObject == NULL) {
3939 *exceptionObject = PyUnicodeDecodeError_Create(
3940 encoding, input, length, startpos, endpos, reason);
3941 }
3942 else {
3943 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3944 goto onError;
3945 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3946 goto onError;
3947 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3948 goto onError;
3949 }
3950 return;
3951
3952onError:
3953 Py_DECREF(*exceptionObject);
3954 *exceptionObject = NULL;
3955}
3956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957/* error handling callback helper:
3958 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003959 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 and adjust various state variables.
3961 return 0 on success, -1 on error
3962*/
3963
Alexander Belopolsky40018472011-02-26 01:02:56 +00003964static int
3965unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003966 const char *encoding, const char *reason,
3967 const char **input, const char **inend, Py_ssize_t *startinpos,
3968 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003969 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003970{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003971 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972
3973 PyObject *restuple = NULL;
3974 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003975 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003976 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003977 Py_ssize_t requiredsize;
3978 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003979 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003980 int res = -1;
3981
Victor Stinner596a6c42011-11-09 00:02:18 +01003982 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3983 outsize = PyUnicode_GET_LENGTH(*output);
3984 else
3985 outsize = _PyUnicode_WSTR_LENGTH(*output);
3986
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003987 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003988 *errorHandler = PyCodec_LookupError(errors);
3989 if (*errorHandler == NULL)
3990 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003991 }
3992
Victor Stinner554f3f02010-06-16 23:33:54 +00003993 make_decode_exception(exceptionObject,
3994 encoding,
3995 *input, *inend - *input,
3996 *startinpos, *endinpos,
3997 reason);
3998 if (*exceptionObject == NULL)
3999 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004000
4001 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4002 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004004 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004005 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004007 }
4008 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004009 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004010 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004011 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004012
4013 /* Copy back the bytes variables, which might have been modified by the
4014 callback */
4015 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4016 if (!inputobj)
4017 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004018 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004019 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004020 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004021 *input = PyBytes_AS_STRING(inputobj);
4022 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004023 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004024 /* we can DECREF safely, as the exception has another reference,
4025 so the object won't go away. */
4026 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004027
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004028 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004029 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004030 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004031 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4032 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004034
Victor Stinner596a6c42011-11-09 00:02:18 +01004035 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4036 /* need more space? (at least enough for what we
4037 have+the replacement+the rest of the string (starting
4038 at the new input position), so we won't have to check space
4039 when there are no errors in the rest of the string) */
4040 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4041 requiredsize = *outpos + replen + insize-newpos;
4042 if (requiredsize > outsize) {
4043 if (requiredsize<2*outsize)
4044 requiredsize = 2*outsize;
4045 if (unicode_resize(output, requiredsize) < 0)
4046 goto onError;
4047 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004048 if (unicode_widen(output, *outpos,
4049 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004050 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004051 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004052 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004053 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004054 else {
4055 wchar_t *repwstr;
4056 Py_ssize_t repwlen;
4057 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4058 if (repwstr == NULL)
4059 goto onError;
4060 /* need more space? (at least enough for what we
4061 have+the replacement+the rest of the string (starting
4062 at the new input position), so we won't have to check space
4063 when there are no errors in the rest of the string) */
4064 requiredsize = *outpos + repwlen + insize-newpos;
4065 if (requiredsize > outsize) {
4066 if (requiredsize < 2*outsize)
4067 requiredsize = 2*outsize;
4068 if (unicode_resize(output, requiredsize) < 0)
4069 goto onError;
4070 }
4071 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4072 *outpos += repwlen;
4073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004074 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004075 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004076
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004077 /* we made it! */
4078 res = 0;
4079
Benjamin Peterson29060642009-01-31 22:14:21 +00004080 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004081 Py_XDECREF(restuple);
4082 return res;
4083}
4084
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so that utf7_category is never indexed
 * out of bounds.  directO and directWS are parenthesized so that compound
 * expressions (e.g. "a || b") can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165
Alexander Belopolsky40018472011-02-26 01:02:56 +00004166PyObject *
4167PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004168 Py_ssize_t size,
4169 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004170{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004171 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4172}
4173
Antoine Pitrou244651a2009-05-04 18:56:13 +00004174/* The decoder. The only state we preserve is our read position,
4175 * i.e. how many characters we have consumed. So if we end in the
4176 * middle of a shift sequence we have to back off the read position
4177 * and the output to the beginning of the sequence, otherwise we lose
4178 * all the shift state (seen bits, number of bits seen, high
4179 * surrogate). */
4180
Alexander Belopolsky40018472011-02-26 01:02:56 +00004181PyObject *
4182PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004183 Py_ssize_t size,
4184 const char *errors,
4185 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004186{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004187 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004188 Py_ssize_t startinpos;
4189 Py_ssize_t endinpos;
4190 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004191 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004192 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004193 const char *errmsg = "";
4194 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004195 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004196 unsigned int base64bits = 0;
4197 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004198 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 PyObject *errorHandler = NULL;
4200 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004201
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004202 /* Start off assuming it's all ASCII. Widen later as necessary. */
4203 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004204 if (!unicode)
4205 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004206 if (size == 0) {
4207 if (consumed)
4208 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004209 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004210 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004211
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004212 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004213 e = s + size;
4214
4215 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004216 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004217 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004218 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004219
Antoine Pitrou244651a2009-05-04 18:56:13 +00004220 if (inShift) { /* in a base-64 section */
4221 if (IS_BASE64(ch)) { /* consume a base-64 character */
4222 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4223 base64bits += 6;
4224 s++;
4225 if (base64bits >= 16) {
4226 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004227 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004228 base64bits -= 16;
4229 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4230 if (surrogate) {
4231 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004232 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4233 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004234 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4235 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004236 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004237 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004238 }
4239 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004240 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4241 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004242 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004243 }
4244 }
Victor Stinner551ac952011-11-29 22:58:13 +01004245 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004246 /* first surrogate */
4247 surrogate = outCh;
4248 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004249 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004250 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4251 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004252 }
4253 }
4254 }
4255 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004256 inShift = 0;
4257 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004258 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004259 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4260 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004261 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004262 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004263 if (base64bits > 0) { /* left-over bits */
4264 if (base64bits >= 6) {
4265 /* We've seen at least one base-64 character */
4266 errmsg = "partial character in shift sequence";
4267 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004268 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004269 else {
4270 /* Some bits remain; they should be zero */
4271 if (base64buffer != 0) {
4272 errmsg = "non-zero padding bits in shift sequence";
4273 goto utf7Error;
4274 }
4275 }
4276 }
4277 if (ch != '-') {
4278 /* '-' is absorbed; other terminating
4279 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004280 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4281 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004282 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004283 }
4284 }
4285 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004286 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004287 s++; /* consume '+' */
4288 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004289 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004290 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4291 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292 }
4293 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004294 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004295 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004296 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297 }
4298 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004299 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004300 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4301 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302 s++;
4303 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004304 else {
4305 startinpos = s-starts;
4306 s++;
4307 errmsg = "unexpected special character";
4308 goto utf7Error;
4309 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004311utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004312 endinpos = s-starts;
4313 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004314 errors, &errorHandler,
4315 "utf7", errmsg,
4316 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004317 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319 }
4320
Antoine Pitrou244651a2009-05-04 18:56:13 +00004321 /* end of string */
4322
4323 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4324 /* if we're in an inconsistent state, that's an error */
4325 if (surrogate ||
4326 (base64bits >= 6) ||
4327 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 endinpos = size;
4329 if (unicode_decode_call_errorhandler(
4330 errors, &errorHandler,
4331 "utf7", "unterminated shift sequence",
4332 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004333 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004334 goto onError;
4335 if (s < e)
4336 goto restart;
4337 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339
4340 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004341 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004342 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004343 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004344 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004345 }
4346 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004347 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004349 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004350
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004352 goto onError;
4353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 Py_XDECREF(errorHandler);
4355 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004356 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359 Py_XDECREF(errorHandler);
4360 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004361 Py_DECREF(unicode);
4362 return NULL;
4363}
4364
4365
Alexander Belopolsky40018472011-02-26 01:02:56 +00004366PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004367_PyUnicode_EncodeUTF7(PyObject *str,
4368 int base64SetO,
4369 int base64WhiteSpace,
4370 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004371{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004372 int kind;
4373 void *data;
4374 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004375 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004376 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004377 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 unsigned int base64bits = 0;
4379 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004380 char * out;
4381 char * start;
4382
Benjamin Petersonbac79492012-01-14 13:34:47 -05004383 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004384 return NULL;
4385 kind = PyUnicode_KIND(str);
4386 data = PyUnicode_DATA(str);
4387 len = PyUnicode_GET_LENGTH(str);
4388
4389 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004391
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004392 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004393 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004394 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004395 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 if (v == NULL)
4397 return NULL;
4398
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004399 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004400 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004401 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004402
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 if (inShift) {
4404 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4405 /* shifting out */
4406 if (base64bits) { /* output remaining bits */
4407 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4408 base64buffer = 0;
4409 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 }
4411 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004412 /* Characters not in the BASE64 set implicitly unshift the sequence
4413 so no '-' is required, except if the character is itself a '-' */
4414 if (IS_BASE64(ch) || ch == '-') {
4415 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 *out++ = (char) ch;
4418 }
4419 else {
4420 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004421 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 else { /* not in a shift sequence */
4424 if (ch == '+') {
4425 *out++ = '+';
4426 *out++ = '-';
4427 }
4428 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4429 *out++ = (char) ch;
4430 }
4431 else {
4432 *out++ = '+';
4433 inShift = 1;
4434 goto encode_char;
4435 }
4436 }
4437 continue;
4438encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004440 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004441
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 /* code first surrogate */
4443 base64bits += 16;
4444 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4445 while (base64bits >= 6) {
4446 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4447 base64bits -= 6;
4448 }
4449 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004450 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 base64bits += 16;
4453 base64buffer = (base64buffer << 16) | ch;
4454 while (base64bits >= 6) {
4455 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4456 base64bits -= 6;
4457 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004458 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 if (base64bits)
4460 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4461 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004463 if (_PyBytes_Resize(&v, out - start) < 0)
4464 return NULL;
4465 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004467PyObject *
4468PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4469 Py_ssize_t size,
4470 int base64SetO,
4471 int base64WhiteSpace,
4472 const char *errors)
4473{
4474 PyObject *result;
4475 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4476 if (tmp == NULL)
4477 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004478 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004479 base64WhiteSpace, errors);
4480 Py_DECREF(tmp);
4481 return result;
4482}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004483
Antoine Pitrou244651a2009-05-04 18:56:13 +00004484#undef IS_BASE64
4485#undef FROM_BASE64
4486#undef TO_BASE64
4487#undef DECODE_DIRECT
4488#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490/* --- UTF-8 Codec -------------------------------------------------------- */
4491
Alexander Belopolsky40018472011-02-26 01:02:56 +00004492PyObject *
4493PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004494 Py_ssize_t size,
4495 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496{
Walter Dörwald69652032004-09-07 20:24:22 +00004497 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4498}
4499
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004500#include "stringlib/asciilib.h"
4501#include "stringlib/codecs.h"
4502#include "stringlib/undef.h"
4503
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004504#include "stringlib/ucs1lib.h"
4505#include "stringlib/codecs.h"
4506#include "stringlib/undef.h"
4507
4508#include "stringlib/ucs2lib.h"
4509#include "stringlib/codecs.h"
4510#include "stringlib/undef.h"
4511
4512#include "stringlib/ucs4lib.h"
4513#include "stringlib/codecs.h"
4514#include "stringlib/undef.h"
4515
Antoine Pitrouab868312009-01-10 15:40:25 +00004516/* Mask to quickly check whether a C 'long' contains a
4517 non-ASCII, UTF8-encoded char. */
4518#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004519# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004520#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004521# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004522#else
4523# error C 'long' size should be either 4 or 8!
4524#endif
4525
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004526static Py_ssize_t
4527ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004528{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004529 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004530 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004532#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004533 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4534 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004535 /* Fast path, see in STRINGLIB(utf8_decode) for
4536 an explanation. */
4537 /* Help register allocation */
4538 register const char *_p = p;
4539 register Py_UCS1 * q = dest;
4540 while (_p < aligned_end) {
4541 unsigned long value = *(const unsigned long *) _p;
4542 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004543 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004544 *((unsigned long *)q) = value;
4545 _p += SIZEOF_LONG;
4546 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004547 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004548 p = _p;
4549 while (p < end) {
4550 if ((unsigned char)*p & 0x80)
4551 break;
4552 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004554 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004556#endif
4557 while (p < end) {
4558 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4559 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004560 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004561 /* Help register allocation */
4562 register const char *_p = p;
4563 while (_p < aligned_end) {
4564 unsigned long value = *(unsigned long *) _p;
4565 if (value & ASCII_CHAR_MASK)
4566 break;
4567 _p += SIZEOF_LONG;
4568 }
4569 p = _p;
4570 if (_p == end)
4571 break;
4572 }
4573 if ((unsigned char)*p & 0x80)
4574 break;
4575 ++p;
4576 }
4577 memcpy(dest, start, p - start);
4578 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579}
Antoine Pitrouab868312009-01-10 15:40:25 +00004580
Victor Stinner785938e2011-12-11 20:09:03 +01004581PyObject *
4582PyUnicode_DecodeUTF8Stateful(const char *s,
4583 Py_ssize_t size,
4584 const char *errors,
4585 Py_ssize_t *consumed)
4586{
Victor Stinner785938e2011-12-11 20:09:03 +01004587 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004588 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004589 const char *end = s + size;
4590 Py_ssize_t outpos;
4591
4592 Py_ssize_t startinpos;
4593 Py_ssize_t endinpos;
4594 const char *errmsg = "";
4595 PyObject *errorHandler = NULL;
4596 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004597
4598 if (size == 0) {
4599 if (consumed)
4600 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004601 Py_INCREF(unicode_empty);
4602 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004603 }
4604
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004605 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4606 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004607 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004608 *consumed = 1;
4609 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004610 }
4611
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004612 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004613 if (!unicode)
4614 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004615
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004616 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4617 s += outpos;
4618 while (s < end) {
4619 Py_UCS4 ch;
4620 int kind = PyUnicode_KIND(unicode);
4621 if (kind == PyUnicode_1BYTE_KIND) {
4622 if (PyUnicode_IS_ASCII(unicode))
4623 ch = asciilib_utf8_decode(&s, end,
4624 PyUnicode_1BYTE_DATA(unicode), &outpos);
4625 else
4626 ch = ucs1lib_utf8_decode(&s, end,
4627 PyUnicode_1BYTE_DATA(unicode), &outpos);
4628 } else if (kind == PyUnicode_2BYTE_KIND) {
4629 ch = ucs2lib_utf8_decode(&s, end,
4630 PyUnicode_2BYTE_DATA(unicode), &outpos);
4631 } else {
4632 assert(kind == PyUnicode_4BYTE_KIND);
4633 ch = ucs4lib_utf8_decode(&s, end,
4634 PyUnicode_4BYTE_DATA(unicode), &outpos);
4635 }
4636
4637 switch (ch) {
4638 case 0:
4639 if (s == end || consumed)
4640 goto End;
4641 errmsg = "unexpected end of data";
4642 startinpos = s - starts;
4643 endinpos = startinpos + 1;
4644 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4645 endinpos++;
4646 break;
4647 case 1:
4648 errmsg = "invalid start byte";
4649 startinpos = s - starts;
4650 endinpos = startinpos + 1;
4651 break;
4652 case 2:
4653 errmsg = "invalid continuation byte";
4654 startinpos = s - starts;
4655 endinpos = startinpos + 1;
4656 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4657 endinpos++;
4658 break;
4659 default:
4660 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4661 goto onError;
4662 continue;
4663 }
4664
4665 if (unicode_decode_call_errorhandler(
4666 errors, &errorHandler,
4667 "utf-8", errmsg,
4668 &starts, &end, &startinpos, &endinpos, &exc, &s,
4669 &unicode, &outpos))
4670 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004671 }
4672
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673End:
4674 if (unicode_resize(&unicode, outpos) < 0)
4675 goto onError;
4676
4677 if (consumed)
4678 *consumed = s - starts;
4679
4680 Py_XDECREF(errorHandler);
4681 Py_XDECREF(exc);
4682 assert(_PyUnicode_CheckConsistency(unicode, 1));
4683 return unicode;
4684
4685onError:
4686 Py_XDECREF(errorHandler);
4687 Py_XDECREF(exc);
4688 Py_XDECREF(unicode);
4689 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004690}
4691
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004692#ifdef __APPLE__
4693
4694/* Simplified UTF-8 decoder using surrogateescape error handler,
4695 used to decode the command line arguments on Mac OS X. */
4696
4697wchar_t*
4698_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4699{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004700 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004701 wchar_t *unicode;
4702 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004703
4704 /* Note: size will always be longer than the resulting Unicode
4705 character count */
4706 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4707 PyErr_NoMemory();
4708 return NULL;
4709 }
4710 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4711 if (!unicode)
4712 return NULL;
4713
4714 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004715 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004717 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004719#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004721#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004723#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 if (ch > 0xFF) {
4725#if SIZEOF_WCHAR_T == 4
4726 assert(0);
4727#else
4728 assert(Py_UNICODE_IS_SURROGATE(ch));
4729 /* compute and append the two surrogates: */
4730 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4731 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4732#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004733 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004734 else {
4735 if (!ch && s == e)
4736 break;
4737 /* surrogateescape */
4738 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4739 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004740 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004741 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004742 return unicode;
4743}
4744
4745#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004747/* Primary internal function which creates utf8 encoded bytes objects.
4748
4749 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004750 and allocate exactly as much space needed at the end. Else allocate the
4751 maximum possible needed (4 result bytes per Unicode character), and return
4752 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004753*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004754PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004755_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756{
Victor Stinner6099a032011-12-18 14:22:26 +01004757 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 void *data;
4759 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004760
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004761 if (!PyUnicode_Check(unicode)) {
4762 PyErr_BadArgument();
4763 return NULL;
4764 }
4765
4766 if (PyUnicode_READY(unicode) == -1)
4767 return NULL;
4768
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004769 if (PyUnicode_UTF8(unicode))
4770 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4771 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004772
4773 kind = PyUnicode_KIND(unicode);
4774 data = PyUnicode_DATA(unicode);
4775 size = PyUnicode_GET_LENGTH(unicode);
4776
Benjamin Petersonead6b532011-12-20 17:23:42 -06004777 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004778 default:
4779 assert(0);
4780 case PyUnicode_1BYTE_KIND:
4781 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4782 assert(!PyUnicode_IS_ASCII(unicode));
4783 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4784 case PyUnicode_2BYTE_KIND:
4785 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4786 case PyUnicode_4BYTE_KIND:
4787 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789}
4790
Alexander Belopolsky40018472011-02-26 01:02:56 +00004791PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004792PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4793 Py_ssize_t size,
4794 const char *errors)
4795{
4796 PyObject *v, *unicode;
4797
4798 unicode = PyUnicode_FromUnicode(s, size);
4799 if (unicode == NULL)
4800 return NULL;
4801 v = _PyUnicode_AsUTF8String(unicode, errors);
4802 Py_DECREF(unicode);
4803 return v;
4804}
4805
4806PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004807PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004809 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810}
4811
Walter Dörwald41980ca2007-08-16 21:55:45 +00004812/* --- UTF-32 Codec ------------------------------------------------------- */
4813
4814PyObject *
4815PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004816 Py_ssize_t size,
4817 const char *errors,
4818 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004819{
4820 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4821}
4822
4823PyObject *
4824PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004825 Py_ssize_t size,
4826 const char *errors,
4827 int *byteorder,
4828 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004829{
4830 const char *starts = s;
4831 Py_ssize_t startinpos;
4832 Py_ssize_t endinpos;
4833 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004834 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004835 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836 int bo = 0; /* assume native ordering by default */
4837 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004838 /* Offsets from q for retrieving bytes in the right order. */
4839#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4840 int iorder[] = {0, 1, 2, 3};
4841#else
4842 int iorder[] = {3, 2, 1, 0};
4843#endif
4844 PyObject *errorHandler = NULL;
4845 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004846
Walter Dörwald41980ca2007-08-16 21:55:45 +00004847 q = (unsigned char *)s;
4848 e = q + size;
4849
4850 if (byteorder)
4851 bo = *byteorder;
4852
4853 /* Check for BOM marks (U+FEFF) in the input and adjust current
4854 byte order setting accordingly. In native mode, the leading BOM
4855 mark is skipped, in all other modes, it is copied to the output
4856 stream as-is (giving a ZWNBSP character). */
4857 if (bo == 0) {
4858 if (size >= 4) {
4859 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004860 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004861#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004862 if (bom == 0x0000FEFF) {
4863 q += 4;
4864 bo = -1;
4865 }
4866 else if (bom == 0xFFFE0000) {
4867 q += 4;
4868 bo = 1;
4869 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004871 if (bom == 0x0000FEFF) {
4872 q += 4;
4873 bo = 1;
4874 }
4875 else if (bom == 0xFFFE0000) {
4876 q += 4;
4877 bo = -1;
4878 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004880 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004881 }
4882
4883 if (bo == -1) {
4884 /* force LE */
4885 iorder[0] = 0;
4886 iorder[1] = 1;
4887 iorder[2] = 2;
4888 iorder[3] = 3;
4889 }
4890 else if (bo == 1) {
4891 /* force BE */
4892 iorder[0] = 3;
4893 iorder[1] = 2;
4894 iorder[2] = 1;
4895 iorder[3] = 0;
4896 }
4897
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004898 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004899 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004900 if (!unicode)
4901 return NULL;
4902 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01004903 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004904 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004905
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 Py_UCS4 ch;
4908 /* remaining bytes at the end? (size should be divisible by 4) */
4909 if (e-q<4) {
4910 if (consumed)
4911 break;
4912 errmsg = "truncated data";
4913 startinpos = ((const char *)q)-starts;
4914 endinpos = ((const char *)e)-starts;
4915 goto utf32Error;
4916 /* The remaining input chars are ignored if the callback
4917 chooses to skip the input */
4918 }
4919 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4920 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004921
Benjamin Peterson29060642009-01-31 22:14:21 +00004922 if (ch >= 0x110000)
4923 {
4924 errmsg = "codepoint not in range(0x110000)";
4925 startinpos = ((const char *)q)-starts;
4926 endinpos = startinpos+4;
4927 goto utf32Error;
4928 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004929 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4930 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 q += 4;
4932 continue;
4933 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 if (unicode_decode_call_errorhandler(
4935 errors, &errorHandler,
4936 "utf32", errmsg,
4937 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004938 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004940 }
4941
4942 if (byteorder)
4943 *byteorder = bo;
4944
4945 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947
4948 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01004949 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 goto onError;
4951
4952 Py_XDECREF(errorHandler);
4953 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004954 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957 Py_DECREF(unicode);
4958 Py_XDECREF(errorHandler);
4959 Py_XDECREF(exc);
4960 return NULL;
4961}
4962
4963PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004964_PyUnicode_EncodeUTF32(PyObject *str,
4965 const char *errors,
4966 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004967{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004968 int kind;
4969 void *data;
4970 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004971 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004972 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004973 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004974 /* Offsets from p for storing byte pairs in the right order. */
4975#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4976 int iorder[] = {0, 1, 2, 3};
4977#else
4978 int iorder[] = {3, 2, 1, 0};
4979#endif
4980
Benjamin Peterson29060642009-01-31 22:14:21 +00004981#define STORECHAR(CH) \
4982 do { \
4983 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4984 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4985 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4986 p[iorder[0]] = (CH) & 0xff; \
4987 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988 } while(0)
4989
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004990 if (!PyUnicode_Check(str)) {
4991 PyErr_BadArgument();
4992 return NULL;
4993 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004994 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004995 return NULL;
4996 kind = PyUnicode_KIND(str);
4997 data = PyUnicode_DATA(str);
4998 len = PyUnicode_GET_LENGTH(str);
4999
5000 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005001 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005002 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005003 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004 if (v == NULL)
5005 return NULL;
5006
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005007 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005010 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005011 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012
5013 if (byteorder == -1) {
5014 /* force LE */
5015 iorder[0] = 0;
5016 iorder[1] = 1;
5017 iorder[2] = 2;
5018 iorder[3] = 3;
5019 }
5020 else if (byteorder == 1) {
5021 /* force BE */
5022 iorder[0] = 3;
5023 iorder[1] = 2;
5024 iorder[2] = 1;
5025 iorder[3] = 0;
5026 }
5027
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005028 for (i = 0; i < len; i++)
5029 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005030
5031 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005032 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005033#undef STORECHAR
5034}
5035
Alexander Belopolsky40018472011-02-26 01:02:56 +00005036PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005037PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5038 Py_ssize_t size,
5039 const char *errors,
5040 int byteorder)
5041{
5042 PyObject *result;
5043 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5044 if (tmp == NULL)
5045 return NULL;
5046 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5047 Py_DECREF(tmp);
5048 return result;
5049}
5050
5051PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005052PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053{
Victor Stinnerb960b342011-11-20 19:12:52 +01005054 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055}
5056
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057/* --- UTF-16 Codec ------------------------------------------------------- */
5058
Tim Peters772747b2001-08-09 22:21:55 +00005059PyObject *
5060PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 Py_ssize_t size,
5062 const char *errors,
5063 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064{
Walter Dörwald69652032004-09-07 20:24:22 +00005065 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5066}
5067
5068PyObject *
5069PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005070 Py_ssize_t size,
5071 const char *errors,
5072 int *byteorder,
5073 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005074{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005075 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005076 Py_ssize_t startinpos;
5077 Py_ssize_t endinpos;
5078 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005079 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005080 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005081 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005082 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005083 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005084 PyObject *errorHandler = NULL;
5085 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086
Tim Peters772747b2001-08-09 22:21:55 +00005087 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005088 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089
5090 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005091 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005093 /* Check for BOM marks (U+FEFF) in the input and adjust current
5094 byte order setting accordingly. In native mode, the leading BOM
5095 mark is skipped, in all other modes, it is copied to the output
5096 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005097 if (bo == 0 && size >= 2) {
5098 const Py_UCS4 bom = (q[1] << 8) | q[0];
5099 if (bom == 0xFEFF) {
5100 q += 2;
5101 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005103 else if (bom == 0xFFFE) {
5104 q += 2;
5105 bo = 1;
5106 }
5107 if (byteorder)
5108 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005109 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110
Antoine Pitrou63065d72012-05-15 23:48:04 +02005111 if (q == e) {
5112 if (consumed)
5113 *consumed = size;
5114 Py_INCREF(unicode_empty);
5115 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005116 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005117
Antoine Pitrouab868312009-01-10 15:40:25 +00005118#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005119 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005120#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005121 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005122#endif
Tim Peters772747b2001-08-09 22:21:55 +00005123
Antoine Pitrou63065d72012-05-15 23:48:04 +02005124 /* Note: size will always be longer than the resulting Unicode
5125 character count */
5126 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5127 if (!unicode)
5128 return NULL;
5129
5130 outpos = 0;
5131 while (1) {
5132 Py_UCS4 ch = 0;
5133 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005134 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005135 if (kind == PyUnicode_1BYTE_KIND) {
5136 if (PyUnicode_IS_ASCII(unicode))
5137 ch = asciilib_utf16_decode(&q, e,
5138 PyUnicode_1BYTE_DATA(unicode), &outpos,
5139 native_ordering);
5140 else
5141 ch = ucs1lib_utf16_decode(&q, e,
5142 PyUnicode_1BYTE_DATA(unicode), &outpos,
5143 native_ordering);
5144 } else if (kind == PyUnicode_2BYTE_KIND) {
5145 ch = ucs2lib_utf16_decode(&q, e,
5146 PyUnicode_2BYTE_DATA(unicode), &outpos,
5147 native_ordering);
5148 } else {
5149 assert(kind == PyUnicode_4BYTE_KIND);
5150 ch = ucs4lib_utf16_decode(&q, e,
5151 PyUnicode_4BYTE_DATA(unicode), &outpos,
5152 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005153 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005154 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005155
Antoine Pitrou63065d72012-05-15 23:48:04 +02005156 switch (ch)
5157 {
5158 case 0:
5159 /* remaining byte at the end? (size should be even) */
5160 if (q == e || consumed)
5161 goto End;
5162 errmsg = "truncated data";
5163 startinpos = ((const char *)q) - starts;
5164 endinpos = ((const char *)e) - starts;
5165 break;
5166 /* The remaining input chars are ignored if the callback
5167 chooses to skip the input */
5168 case 1:
5169 errmsg = "unexpected end of data";
5170 startinpos = ((const char *)q) - 2 - starts;
5171 endinpos = ((const char *)e) - starts;
5172 break;
5173 case 2:
5174 errmsg = "illegal encoding";
5175 startinpos = ((const char *)q) - 2 - starts;
5176 endinpos = startinpos + 2;
5177 break;
5178 case 3:
5179 errmsg = "illegal UTF-16 surrogate";
5180 startinpos = ((const char *)q) - 4 - starts;
5181 endinpos = startinpos + 2;
5182 break;
5183 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005184 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5185 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005186 continue;
5187 }
5188
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005190 errors,
5191 &errorHandler,
5192 "utf16", errmsg,
5193 &starts,
5194 (const char **)&e,
5195 &startinpos,
5196 &endinpos,
5197 &exc,
5198 (const char **)&q,
5199 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005200 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 }
5203
Antoine Pitrou63065d72012-05-15 23:48:04 +02005204End:
Walter Dörwald69652032004-09-07 20:24:22 +00005205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005206 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005209 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 goto onError;
5211
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005212 Py_XDECREF(errorHandler);
5213 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005214 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Benjamin Peterson29060642009-01-31 22:14:21 +00005216 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005218 Py_XDECREF(errorHandler);
5219 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 return NULL;
5221}
5222
Tim Peters772747b2001-08-09 22:21:55 +00005223PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005224_PyUnicode_EncodeUTF16(PyObject *str,
5225 const char *errors,
5226 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005228 enum PyUnicode_Kind kind;
5229 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005230 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005231 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005232 unsigned short *out;
5233 Py_ssize_t bytesize;
5234 Py_ssize_t pairs;
5235#ifdef WORDS_BIGENDIAN
5236 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005237#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005238 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005239#endif
5240
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005241 if (!PyUnicode_Check(str)) {
5242 PyErr_BadArgument();
5243 return NULL;
5244 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005245 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005246 return NULL;
5247 kind = PyUnicode_KIND(str);
5248 data = PyUnicode_DATA(str);
5249 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005250
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005251 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005252 if (kind == PyUnicode_4BYTE_KIND) {
5253 const Py_UCS4 *in = (const Py_UCS4 *)data;
5254 const Py_UCS4 *end = in + len;
5255 while (in < end)
5256 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005257 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005258 }
5259 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005260 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005261 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005262 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 if (v == NULL)
5264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005266 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005267 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005268 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005270 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005271 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005272 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005273
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005274 switch (kind) {
5275 case PyUnicode_1BYTE_KIND: {
5276 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5277 break;
Tim Peters772747b2001-08-09 22:21:55 +00005278 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005279 case PyUnicode_2BYTE_KIND: {
5280 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5281 break;
Tim Peters772747b2001-08-09 22:21:55 +00005282 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005283 case PyUnicode_4BYTE_KIND: {
5284 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5285 break;
5286 }
5287 default:
5288 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005289 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005290
5291 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005292 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005293}
5294
Alexander Belopolsky40018472011-02-26 01:02:56 +00005295PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005296PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5297 Py_ssize_t size,
5298 const char *errors,
5299 int byteorder)
5300{
5301 PyObject *result;
5302 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5303 if (tmp == NULL)
5304 return NULL;
5305 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5306 Py_DECREF(tmp);
5307 return result;
5308}
5309
5310PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005311PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005313 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314}
5315
5316/* --- Unicode Escape Codec ----------------------------------------------- */
5317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005318/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5319 if all the escapes in the string make it still a valid ASCII string.
5320 Returns -1 if any escapes were found which cause the string to
5321 pop out of ASCII range. Otherwise returns the length of the
5322 required buffer to hold the string.
5323 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005324static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005325length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5326{
5327 const unsigned char *p = (const unsigned char *)s;
5328 const unsigned char *end = p + size;
5329 Py_ssize_t length = 0;
5330
5331 if (size < 0)
5332 return -1;
5333
5334 for (; p < end; ++p) {
5335 if (*p > 127) {
5336 /* Non-ASCII */
5337 return -1;
5338 }
5339 else if (*p != '\\') {
5340 /* Normal character */
5341 ++length;
5342 }
5343 else {
5344 /* Backslash-escape, check next char */
5345 ++p;
5346 /* Escape sequence reaches till end of string or
5347 non-ASCII follow-up. */
5348 if (p >= end || *p > 127)
5349 return -1;
5350 switch (*p) {
5351 case '\n':
5352 /* backslash + \n result in zero characters */
5353 break;
5354 case '\\': case '\'': case '\"':
5355 case 'b': case 'f': case 't':
5356 case 'n': case 'r': case 'v': case 'a':
5357 ++length;
5358 break;
5359 case '0': case '1': case '2': case '3':
5360 case '4': case '5': case '6': case '7':
5361 case 'x': case 'u': case 'U': case 'N':
5362 /* these do not guarantee ASCII characters */
5363 return -1;
5364 default:
5365 /* count the backslash + the other character */
5366 length += 2;
5367 }
5368 }
5369 }
5370 return length;
5371}
5372
Fredrik Lundh06d12682001-01-24 07:59:11 +00005373static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005374
Alexander Belopolsky40018472011-02-26 01:02:56 +00005375PyObject *
5376PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005377 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005378 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005381 Py_ssize_t startinpos;
5382 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005383 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005384 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005386 char* message;
5387 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005388 PyObject *errorHandler = NULL;
5389 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005390 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005391 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005392
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005393 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005394
5395 /* After length_of_escaped_ascii_string() there are two alternatives,
5396 either the string is pure ASCII with named escapes like \n, etc.
5397 and we determined it's exact size (common case)
5398 or it contains \x, \u, ... escape sequences. then we create a
5399 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005400 if (len >= 0) {
5401 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005402 if (!v)
5403 goto onError;
5404 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005405 }
5406 else {
5407 /* Escaped strings will always be longer than the resulting
5408 Unicode string, so we start with size here and then reduce the
5409 length after conversion to the true value.
5410 (but if the error callback returns a long replacement string
5411 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005412 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005413 if (!v)
5414 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005415 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005416 }
5417
Guido van Rossumd57fd912000-03-10 22:53:23 +00005418 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005419 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005420 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005422
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 while (s < end) {
5424 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005425 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005426 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005428 /* The only case in which i == ascii_length is a backslash
5429 followed by a newline. */
5430 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005431
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432 /* Non-escape characters are interpreted as Unicode ordinals */
5433 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005434 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5435 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 continue;
5437 }
5438
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005439 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 /* \ - Escapes */
5441 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005442 c = *s++;
5443 if (s > end)
5444 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005445
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005446 /* The only case in which i == ascii_length is a backslash
5447 followed by a newline. */
5448 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005450 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451
Benjamin Peterson29060642009-01-31 22:14:21 +00005452 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005453#define WRITECHAR(ch) \
5454 do { \
5455 if (unicode_putchar(&v, &i, ch) < 0) \
5456 goto onError; \
5457 }while(0)
5458
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005460 case '\\': WRITECHAR('\\'); break;
5461 case '\'': WRITECHAR('\''); break;
5462 case '\"': WRITECHAR('\"'); break;
5463 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005464 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005465 case 'f': WRITECHAR('\014'); break;
5466 case 't': WRITECHAR('\t'); break;
5467 case 'n': WRITECHAR('\n'); break;
5468 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005469 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005470 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005471 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005472 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
Benjamin Peterson29060642009-01-31 22:14:21 +00005474 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 case '0': case '1': case '2': case '3':
5476 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005477 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005478 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005479 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005480 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005481 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005483 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 break;
5485
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 /* hex escapes */
5487 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005489 digits = 2;
5490 message = "truncated \\xXX escape";
5491 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005495 digits = 4;
5496 message = "truncated \\uXXXX escape";
5497 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005500 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005501 digits = 8;
5502 message = "truncated \\UXXXXXXXX escape";
5503 hexescape:
5504 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 if (s+digits>end) {
5506 endinpos = size;
5507 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 errors, &errorHandler,
5509 "unicodeescape", "end of string in escape sequence",
5510 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005511 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 goto onError;
5513 goto nextByte;
5514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005515 for (j = 0; j < digits; ++j) {
5516 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005517 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005520 errors, &errorHandler,
5521 "unicodeescape", message,
5522 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005523 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005524 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005525 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005526 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005527 }
5528 chr = (chr<<4) & ~0xF;
5529 if (c >= '0' && c <= '9')
5530 chr += c - '0';
5531 else if (c >= 'a' && c <= 'f')
5532 chr += 10 + c - 'a';
5533 else
5534 chr += 10 + c - 'A';
5535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005537 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005538 /* _decoding_error will have already written into the
5539 target buffer. */
5540 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005541 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005542 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005543 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005544 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005545 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005546 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005548 errors, &errorHandler,
5549 "unicodeescape", "illegal Unicode character",
5550 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005551 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005552 goto onError;
5553 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 break;
5555
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005557 case 'N':
5558 message = "malformed \\N character escape";
5559 if (ucnhash_CAPI == NULL) {
5560 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5562 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005563 if (ucnhash_CAPI == NULL)
5564 goto ucnhashError;
5565 }
5566 if (*s == '{') {
5567 const char *start = s+1;
5568 /* look for the closing brace */
5569 while (*s != '}' && s < end)
5570 s++;
5571 if (s > start && s < end && *s == '}') {
5572 /* found a name. look it up in the unicode database */
5573 message = "unknown Unicode character name";
5574 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005576 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005577 goto store;
5578 }
5579 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005581 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005582 errors, &errorHandler,
5583 "unicodeescape", message,
5584 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005585 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005586 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005587 break;
5588
5589 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005590 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005591 message = "\\ at end of string";
5592 s--;
5593 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005595 errors, &errorHandler,
5596 "unicodeescape", message,
5597 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005598 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005599 goto onError;
5600 }
5601 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005602 WRITECHAR('\\');
5603 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005604 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005605 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005607 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005608 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005610#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005611
Victor Stinner16e6a802011-12-12 13:24:15 +01005612 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005613 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005614 Py_XDECREF(errorHandler);
5615 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005616 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005617
Benjamin Peterson29060642009-01-31 22:14:21 +00005618 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005619 PyErr_SetString(
5620 PyExc_UnicodeError,
5621 "\\N escapes not supported (can't load unicodedata module)"
5622 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005623 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005624 Py_XDECREF(errorHandler);
5625 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005626 return NULL;
5627
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 Py_XDECREF(errorHandler);
5631 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 return NULL;
5633}
5634
5635/* Return a Unicode-Escape string version of the Unicode object.
5636
5637 If quotes is true, the string is enclosed in u"" or u'' quotes as
5638 appropriate.
5639
5640*/
5641
Alexander Belopolsky40018472011-02-26 01:02:56 +00005642PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005643PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005645 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005646 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005648 int kind;
5649 void *data;
5650 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Ezio Melottie7f90372012-10-05 03:33:31 +03005652 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005653 escape.
5654
Ezio Melottie7f90372012-10-05 03:33:31 +03005655 For UCS1 strings it's '\xxx', 4 bytes per source character.
5656 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5657 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005658 */
5659
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005660 if (!PyUnicode_Check(unicode)) {
5661 PyErr_BadArgument();
5662 return NULL;
5663 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005664 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005665 return NULL;
5666 len = PyUnicode_GET_LENGTH(unicode);
5667 kind = PyUnicode_KIND(unicode);
5668 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005669 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005670 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5671 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5672 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5673 }
5674
5675 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 return PyBytes_FromStringAndSize(NULL, 0);
5677
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005680
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005681 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005683 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005684 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 if (repr == NULL)
5686 return NULL;
5687
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005688 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005690 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005691 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005692
Walter Dörwald79e913e2007-05-12 11:08:06 +00005693 /* Escape backslashes */
5694 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 *p++ = '\\';
5696 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005697 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005698 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005699
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005700 /* Map 21-bit characters to '\U00xxxxxx' */
5701 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005702 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005703 *p++ = '\\';
5704 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005705 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5706 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5707 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5708 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5709 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5710 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5711 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5712 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005714 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005715
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005717 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 *p++ = '\\';
5719 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005720 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5721 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5722 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5723 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005725
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005726 /* Map special whitespace to '\t', \n', '\r' */
5727 else if (ch == '\t') {
5728 *p++ = '\\';
5729 *p++ = 't';
5730 }
5731 else if (ch == '\n') {
5732 *p++ = '\\';
5733 *p++ = 'n';
5734 }
5735 else if (ch == '\r') {
5736 *p++ = '\\';
5737 *p++ = 'r';
5738 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005739
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005740 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005741 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005743 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005744 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5745 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005746 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005747
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 /* Copy everything else as-is */
5749 else
5750 *p++ = (char) ch;
5751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005753 assert(p - PyBytes_AS_STRING(repr) > 0);
5754 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5755 return NULL;
5756 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757}
5758
Alexander Belopolsky40018472011-02-26 01:02:56 +00005759PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005760PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5761 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763 PyObject *result;
5764 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5765 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005767 result = PyUnicode_AsUnicodeEscapeString(tmp);
5768 Py_DECREF(tmp);
5769 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770}
5771
5772/* --- Raw Unicode Escape Codec ------------------------------------------- */
5773
Alexander Belopolsky40018472011-02-26 01:02:56 +00005774PyObject *
5775PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005776 Py_ssize_t size,
5777 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005779 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005780 Py_ssize_t startinpos;
5781 Py_ssize_t endinpos;
5782 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005783 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005784 const char *end;
5785 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005786 PyObject *errorHandler = NULL;
5787 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005788
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 /* Escaped strings will always be longer than the resulting
5790 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 length after conversion to the true value. (But decoding error
5792 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005793 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005797 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005798 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 end = s + size;
5800 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005801 unsigned char c;
5802 Py_UCS4 x;
5803 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005804 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805
Benjamin Peterson29060642009-01-31 22:14:21 +00005806 /* Non-escape characters are interpreted as Unicode ordinals */
5807 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005808 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5809 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005810 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005811 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 startinpos = s-starts;
5813
5814 /* \u-escapes are only interpreted iff the number of leading
5815 backslashes if odd */
5816 bs = s;
5817 for (;s < end;) {
5818 if (*s != '\\')
5819 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005820 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5821 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005822 }
5823 if (((s - bs) & 1) == 0 ||
5824 s >= end ||
5825 (*s != 'u' && *s != 'U')) {
5826 continue;
5827 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005828 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 count = *s=='u' ? 4 : 8;
5830 s++;
5831
5832 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 for (x = 0, i = 0; i < count; ++i, ++s) {
5834 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005835 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005836 endinpos = s-starts;
5837 if (unicode_decode_call_errorhandler(
5838 errors, &errorHandler,
5839 "rawunicodeescape", "truncated \\uXXXX",
5840 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005841 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 goto onError;
5843 goto nextByte;
5844 }
5845 x = (x<<4) & ~0xF;
5846 if (c >= '0' && c <= '9')
5847 x += c - '0';
5848 else if (c >= 'a' && c <= 'f')
5849 x += 10 + c - 'a';
5850 else
5851 x += 10 + c - 'A';
5852 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005853 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005854 if (unicode_putchar(&v, &outpos, x) < 0)
5855 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005856 } else {
5857 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005858 if (unicode_decode_call_errorhandler(
5859 errors, &errorHandler,
5860 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005864 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 nextByte:
5866 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005868 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005869 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 Py_XDECREF(errorHandler);
5871 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005872 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005873
Benjamin Peterson29060642009-01-31 22:14:21 +00005874 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005876 Py_XDECREF(errorHandler);
5877 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 return NULL;
5879}
5880
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881
Alexander Belopolsky40018472011-02-26 01:02:56 +00005882PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005885 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005886 char *p;
5887 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005888 Py_ssize_t expandsize, pos;
5889 int kind;
5890 void *data;
5891 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005892
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893 if (!PyUnicode_Check(unicode)) {
5894 PyErr_BadArgument();
5895 return NULL;
5896 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005897 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 return NULL;
5899 kind = PyUnicode_KIND(unicode);
5900 data = PyUnicode_DATA(unicode);
5901 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06005902 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5903 bytes, and 1 byte characters 4. */
5904 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01005905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00005908
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005909 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 if (repr == NULL)
5911 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005912 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005913 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005915 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005916 for (pos = 0; pos < len; pos++) {
5917 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 /* Map 32-bit characters to '\Uxxxxxxxx' */
5919 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005920 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005921 *p++ = '\\';
5922 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005923 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5924 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5925 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5926 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5927 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5928 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5929 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5930 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00005931 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005934 *p++ = '\\';
5935 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005936 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5937 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5938 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5939 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005941 /* Copy everything else as-is */
5942 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 *p++ = (char) ch;
5944 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005945
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005946 assert(p > q);
5947 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005948 return NULL;
5949 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950}
5951
Alexander Belopolsky40018472011-02-26 01:02:56 +00005952PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005953PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5954 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 PyObject *result;
5957 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5958 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00005959 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005960 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
5961 Py_DECREF(tmp);
5962 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005963}
5964
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005965/* --- Unicode Internal Codec ------------------------------------------- */
5966
Alexander Belopolsky40018472011-02-26 01:02:56 +00005967PyObject *
5968_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005969 Py_ssize_t size,
5970 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005971{
5972 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005973 Py_ssize_t startinpos;
5974 Py_ssize_t endinpos;
5975 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005976 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005977 const char *end;
5978 const char *reason;
5979 PyObject *errorHandler = NULL;
5980 PyObject *exc = NULL;
5981
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005982 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02005983 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01005984 1))
5985 return NULL;
5986
Thomas Wouters89f507f2006-12-13 04:49:30 +00005987 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005988 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005989 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005990 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005991 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005992 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005993 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00005994 end = s + size;
5995
5996 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01005997 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01005998 Py_UCS4 ch;
5999 /* We copy the raw representation one byte at a time because the
6000 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006001 ((char *) &uch)[0] = s[0];
6002 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006003#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006004 ((char *) &uch)[2] = s[2];
6005 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006006#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006007 ch = uch;
6008
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006009 /* We have to sanity check the raw data, otherwise doom looms for
6010 some malformed UCS-4 data. */
6011 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006012#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006013 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006014#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006015 end-s < Py_UNICODE_SIZE
6016 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006018 startinpos = s - starts;
6019 if (end-s < Py_UNICODE_SIZE) {
6020 endinpos = end-starts;
6021 reason = "truncated input";
6022 }
6023 else {
6024 endinpos = s - starts + Py_UNICODE_SIZE;
6025 reason = "illegal code point (> 0x10FFFF)";
6026 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006027 if (unicode_decode_call_errorhandler(
6028 errors, &errorHandler,
6029 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006030 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006031 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006032 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006033 continue;
6034 }
6035
6036 s += Py_UNICODE_SIZE;
6037#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006038 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006039 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006040 Py_UNICODE uch2;
6041 ((char *) &uch2)[0] = s[0];
6042 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006043 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006044 {
Victor Stinner551ac952011-11-29 22:58:13 +01006045 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006046 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006047 }
6048 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006049#endif
6050
6051 if (unicode_putchar(&v, &outpos, ch) < 0)
6052 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006053 }
6054
Victor Stinner16e6a802011-12-12 13:24:15 +01006055 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006056 goto onError;
6057 Py_XDECREF(errorHandler);
6058 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006059 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006060
Benjamin Peterson29060642009-01-31 22:14:21 +00006061 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006062 Py_XDECREF(v);
6063 Py_XDECREF(errorHandler);
6064 Py_XDECREF(exc);
6065 return NULL;
6066}
6067
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068/* --- Latin-1 Codec ------------------------------------------------------ */
6069
Alexander Belopolsky40018472011-02-26 01:02:56 +00006070PyObject *
6071PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006072 Py_ssize_t size,
6073 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006076 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077}
6078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006080static void
6081make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006082 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006083 PyObject *unicode,
6084 Py_ssize_t startpos, Py_ssize_t endpos,
6085 const char *reason)
6086{
6087 if (*exceptionObject == NULL) {
6088 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006089 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006090 encoding, unicode, startpos, endpos, reason);
6091 }
6092 else {
6093 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6094 goto onError;
6095 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6096 goto onError;
6097 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6098 goto onError;
6099 return;
6100 onError:
6101 Py_DECREF(*exceptionObject);
6102 *exceptionObject = NULL;
6103 }
6104}
6105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006107static void
6108raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006109 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006110 PyObject *unicode,
6111 Py_ssize_t startpos, Py_ssize_t endpos,
6112 const char *reason)
6113{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006114 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006115 encoding, unicode, startpos, endpos, reason);
6116 if (*exceptionObject != NULL)
6117 PyCodec_StrictErrors(*exceptionObject);
6118}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119
6120/* error handling callback helper:
6121 build arguments, call the callback and check the arguments,
6122 put the result into newpos and return the replacement string, which
6123 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006124static PyObject *
6125unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006126 PyObject **errorHandler,
6127 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006128 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006129 Py_ssize_t startpos, Py_ssize_t endpos,
6130 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006132 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006133 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006134 PyObject *restuple;
6135 PyObject *resunicode;
6136
6137 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006139 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006141 }
6142
Benjamin Petersonbac79492012-01-14 13:34:47 -05006143 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006144 return NULL;
6145 len = PyUnicode_GET_LENGTH(unicode);
6146
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006147 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006148 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006151
6152 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006154 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006155 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006156 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006157 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 Py_DECREF(restuple);
6159 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006160 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006161 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006162 &resunicode, newpos)) {
6163 Py_DECREF(restuple);
6164 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006165 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006166 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6167 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6168 Py_DECREF(restuple);
6169 return NULL;
6170 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006171 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006172 *newpos = len + *newpos;
6173 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006174 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6175 Py_DECREF(restuple);
6176 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006177 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006178 Py_INCREF(resunicode);
6179 Py_DECREF(restuple);
6180 return resunicode;
6181}
6182
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006184unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006185 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006186 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006187{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006188 /* input state */
6189 Py_ssize_t pos=0, size;
6190 int kind;
6191 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006192 /* output object */
6193 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006194 /* pointer into the output */
6195 char *str;
6196 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006197 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006198 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6199 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006200 PyObject *errorHandler = NULL;
6201 PyObject *exc = NULL;
6202 /* the following variable is used for caching string comparisons
6203 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6204 int known_errorHandler = -1;
6205
Benjamin Petersonbac79492012-01-14 13:34:47 -05006206 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006207 return NULL;
6208 size = PyUnicode_GET_LENGTH(unicode);
6209 kind = PyUnicode_KIND(unicode);
6210 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006211 /* allocate enough for a simple encoding without
6212 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006213 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006214 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006215 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006217 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006218 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006219 ressize = size;
6220
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006221 while (pos < size) {
6222 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006223
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 /* can we encode this? */
6225 if (c<limit) {
6226 /* no overflow check, because we know that the space is enough */
6227 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006228 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006229 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006230 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 Py_ssize_t requiredsize;
6232 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006233 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006234 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 Py_ssize_t collstart = pos;
6236 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006238 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006239 ++collend;
6240 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6241 if (known_errorHandler==-1) {
6242 if ((errors==NULL) || (!strcmp(errors, "strict")))
6243 known_errorHandler = 1;
6244 else if (!strcmp(errors, "replace"))
6245 known_errorHandler = 2;
6246 else if (!strcmp(errors, "ignore"))
6247 known_errorHandler = 3;
6248 else if (!strcmp(errors, "xmlcharrefreplace"))
6249 known_errorHandler = 4;
6250 else
6251 known_errorHandler = 0;
6252 }
6253 switch (known_errorHandler) {
6254 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006255 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 goto onError;
6257 case 2: /* replace */
6258 while (collstart++<collend)
6259 *str++ = '?'; /* fall through */
6260 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006261 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 break;
6263 case 4: /* xmlcharrefreplace */
6264 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006265 /* determine replacement size */
6266 for (i = collstart, repsize = 0; i < collend; ++i) {
6267 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6268 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006270 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006272 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006274 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006276 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006278 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006280 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006281 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006283 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006284 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006285 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 if (requiredsize > ressize) {
6287 if (requiredsize<2*ressize)
6288 requiredsize = 2*ressize;
6289 if (_PyBytes_Resize(&res, requiredsize))
6290 goto onError;
6291 str = PyBytes_AS_STRING(res) + respos;
6292 ressize = requiredsize;
6293 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006294 /* generate replacement */
6295 for (i = collstart; i < collend; ++i) {
6296 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006298 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 break;
6300 default:
6301 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006302 encoding, reason, unicode, &exc,
6303 collstart, collend, &newpos);
6304 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006305 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006307 if (PyBytes_Check(repunicode)) {
6308 /* Directly copy bytes result to output. */
6309 repsize = PyBytes_Size(repunicode);
6310 if (repsize > 1) {
6311 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006312 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006313 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6314 Py_DECREF(repunicode);
6315 goto onError;
6316 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006317 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006318 ressize += repsize-1;
6319 }
6320 memcpy(str, PyBytes_AsString(repunicode), repsize);
6321 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006322 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006323 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006324 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006326 /* need more space? (at least enough for what we
6327 have+the replacement+the rest of the string, so
6328 we won't have to check space for encodable characters) */
6329 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006330 repsize = PyUnicode_GET_LENGTH(repunicode);
6331 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006332 if (requiredsize > ressize) {
6333 if (requiredsize<2*ressize)
6334 requiredsize = 2*ressize;
6335 if (_PyBytes_Resize(&res, requiredsize)) {
6336 Py_DECREF(repunicode);
6337 goto onError;
6338 }
6339 str = PyBytes_AS_STRING(res) + respos;
6340 ressize = requiredsize;
6341 }
6342 /* check if there is anything unencodable in the replacement
6343 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 for (i = 0; repsize-->0; ++i, ++str) {
6345 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006347 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006348 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 Py_DECREF(repunicode);
6350 goto onError;
6351 }
6352 *str = (char)c;
6353 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006354 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006355 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006356 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006357 }
6358 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006359 /* Resize if we allocated to much */
6360 size = str - PyBytes_AS_STRING(res);
6361 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006362 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006363 if (_PyBytes_Resize(&res, size) < 0)
6364 goto onError;
6365 }
6366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 Py_XDECREF(errorHandler);
6368 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006369 return res;
6370
6371 onError:
6372 Py_XDECREF(res);
6373 Py_XDECREF(errorHandler);
6374 Py_XDECREF(exc);
6375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376}
6377
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006378/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006379PyObject *
6380PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006381 Py_ssize_t size,
6382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 PyObject *result;
6385 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6386 if (unicode == NULL)
6387 return NULL;
6388 result = unicode_encode_ucs1(unicode, errors, 256);
6389 Py_DECREF(unicode);
6390 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391}
6392
Alexander Belopolsky40018472011-02-26 01:02:56 +00006393PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006394_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395{
6396 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 PyErr_BadArgument();
6398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006400 if (PyUnicode_READY(unicode) == -1)
6401 return NULL;
6402 /* Fast path: if it is a one-byte string, construct
6403 bytes object directly. */
6404 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6405 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6406 PyUnicode_GET_LENGTH(unicode));
6407 /* Non-Latin-1 characters present. Defer to above function to
6408 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006410}
6411
6412PyObject*
6413PyUnicode_AsLatin1String(PyObject *unicode)
6414{
6415 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416}
6417
6418/* --- 7-bit ASCII Codec -------------------------------------------------- */
6419
Alexander Belopolsky40018472011-02-26 01:02:56 +00006420PyObject *
6421PyUnicode_DecodeASCII(const char *s,
6422 Py_ssize_t size,
6423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006425 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006426 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006427 int kind;
6428 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006429 Py_ssize_t startinpos;
6430 Py_ssize_t endinpos;
6431 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 const char *e;
6433 PyObject *errorHandler = NULL;
6434 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006435
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006436 if (size == 0) {
6437 Py_INCREF(unicode_empty);
6438 return unicode_empty;
6439 }
6440
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006442 if (size == 1 && (unsigned char)s[0] < 128)
6443 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006444
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006445 unicode = PyUnicode_New(size, 127);
6446 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006447 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006450 data = PyUnicode_1BYTE_DATA(unicode);
6451 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6452 if (outpos == size)
6453 return unicode;
6454
6455 s += outpos;
6456 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 register unsigned char c = (unsigned char)*s;
6459 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006460 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 ++s;
6462 }
6463 else {
6464 startinpos = s-starts;
6465 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 if (unicode_decode_call_errorhandler(
6467 errors, &errorHandler,
6468 "ascii", "ordinal not in range(128)",
6469 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006470 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006471 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006472 kind = PyUnicode_KIND(unicode);
6473 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006476 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006478 Py_XDECREF(errorHandler);
6479 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006480 assert(_PyUnicode_CheckConsistency(unicode, 1));
6481 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006484 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 Py_XDECREF(errorHandler);
6486 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487 return NULL;
6488}
6489
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006490/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006491PyObject *
6492PyUnicode_EncodeASCII(const Py_UNICODE *p,
6493 Py_ssize_t size,
6494 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496 PyObject *result;
6497 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6498 if (unicode == NULL)
6499 return NULL;
6500 result = unicode_encode_ucs1(unicode, errors, 128);
6501 Py_DECREF(unicode);
6502 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503}
6504
Alexander Belopolsky40018472011-02-26 01:02:56 +00006505PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006506_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507{
6508 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 PyErr_BadArgument();
6510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006512 if (PyUnicode_READY(unicode) == -1)
6513 return NULL;
6514 /* Fast path: if it is an ASCII-only string, construct bytes object
6515 directly. Else defer to above function to raise the exception. */
6516 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6517 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6518 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006519 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006520}
6521
6522PyObject *
6523PyUnicode_AsASCIIString(PyObject *unicode)
6524{
6525 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526}
6527
Victor Stinner99b95382011-07-04 14:23:54 +02006528#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006529
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006530/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006531
/* The code page helpers below take int lengths.  When size_t is wider than
   int, NEED_RETRY makes the drivers process long inputs in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6539
6540static char*
6541code_page_name(UINT code_page, PyObject **obj)
6542{
6543 *obj = NULL;
6544 if (code_page == CP_ACP)
6545 return "mbcs";
6546 if (code_page == CP_UTF7)
6547 return "CP_UTF7";
6548 if (code_page == CP_UTF8)
6549 return "CP_UTF8";
6550
6551 *obj = PyBytes_FromFormat("cp%u", code_page);
6552 if (*obj == NULL)
6553 return NULL;
6554 return PyBytes_AS_STRING(*obj);
6555}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006556
Alexander Belopolsky40018472011-02-26 01:02:56 +00006557static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006558is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006559{
6560 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006561 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006562
Victor Stinner3a50e702011-10-18 21:21:00 +02006563 if (!IsDBCSLeadByteEx(code_page, *curr))
6564 return 0;
6565
6566 prev = CharPrevExA(code_page, s, curr, 0);
6567 if (prev == curr)
6568 return 1;
6569 /* FIXME: This code is limited to "true" double-byte encodings,
6570 as it assumes an incomplete character consists of a single
6571 byte. */
6572 if (curr - prev == 2)
6573 return 1;
6574 if (!IsDBCSLeadByteEx(code_page, *prev))
6575 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006576 return 0;
6577}
6578
Victor Stinner3a50e702011-10-18 21:21:00 +02006579static DWORD
6580decode_code_page_flags(UINT code_page)
6581{
6582 if (code_page == CP_UTF7) {
6583 /* The CP_UTF7 decoder only supports flags=0 */
6584 return 0;
6585 }
6586 else
6587 return MB_ERR_INVALID_CHARS;
6588}
6589
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006590/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006591 * Decode a byte string from a Windows code page into unicode object in strict
6592 * mode.
6593 *
6594 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6595 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006596 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006597static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006598decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006599 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006600 const char *in,
6601 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006602{
Victor Stinner3a50e702011-10-18 21:21:00 +02006603 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006604 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006605 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006606
6607 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006608 assert(insize > 0);
6609 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6610 if (outsize <= 0)
6611 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006612
6613 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006615 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006616 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 if (*v == NULL)
6618 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006619 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006620 }
6621 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006623 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006624 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006626 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006627 }
6628
6629 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006630 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6631 if (outsize <= 0)
6632 goto error;
6633 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006634
Victor Stinner3a50e702011-10-18 21:21:00 +02006635error:
6636 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6637 return -2;
6638 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006639 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006640}
6641
Victor Stinner3a50e702011-10-18 21:21:00 +02006642/*
6643 * Decode a byte string from a code page into unicode object with an error
6644 * handler.
6645 *
6646 * Returns consumed size if succeed, or raise a WindowsError or
6647 * UnicodeDecodeError exception and returns -1 on error.
6648 */
6649static int
6650decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006651 PyObject **v,
6652 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006653 const char *errors)
6654{
6655 const char *startin = in;
6656 const char *endin = in + size;
6657 const DWORD flags = decode_code_page_flags(code_page);
6658 /* Ideally, we should get reason from FormatMessage. This is the Windows
6659 2000 English version of the message. */
6660 const char *reason = "No mapping for the Unicode character exists "
6661 "in the target code page.";
6662 /* each step cannot decode more than 1 character, but a character can be
6663 represented as a surrogate pair */
6664 wchar_t buffer[2], *startout, *out;
6665 int insize, outsize;
6666 PyObject *errorHandler = NULL;
6667 PyObject *exc = NULL;
6668 PyObject *encoding_obj = NULL;
6669 char *encoding;
6670 DWORD err;
6671 int ret = -1;
6672
6673 assert(size > 0);
6674
6675 encoding = code_page_name(code_page, &encoding_obj);
6676 if (encoding == NULL)
6677 return -1;
6678
6679 if (errors == NULL || strcmp(errors, "strict") == 0) {
6680 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6681 UnicodeDecodeError. */
6682 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6683 if (exc != NULL) {
6684 PyCodec_StrictErrors(exc);
6685 Py_CLEAR(exc);
6686 }
6687 goto error;
6688 }
6689
6690 if (*v == NULL) {
6691 /* Create unicode object */
6692 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6693 PyErr_NoMemory();
6694 goto error;
6695 }
Victor Stinnerab595942011-12-17 04:59:06 +01006696 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006697 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006698 if (*v == NULL)
6699 goto error;
6700 startout = PyUnicode_AS_UNICODE(*v);
6701 }
6702 else {
6703 /* Extend unicode object */
6704 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6705 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6706 PyErr_NoMemory();
6707 goto error;
6708 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006709 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006710 goto error;
6711 startout = PyUnicode_AS_UNICODE(*v) + n;
6712 }
6713
6714 /* Decode the byte string character per character */
6715 out = startout;
6716 while (in < endin)
6717 {
6718 /* Decode a character */
6719 insize = 1;
6720 do
6721 {
6722 outsize = MultiByteToWideChar(code_page, flags,
6723 in, insize,
6724 buffer, Py_ARRAY_LENGTH(buffer));
6725 if (outsize > 0)
6726 break;
6727 err = GetLastError();
6728 if (err != ERROR_NO_UNICODE_TRANSLATION
6729 && err != ERROR_INSUFFICIENT_BUFFER)
6730 {
6731 PyErr_SetFromWindowsErr(0);
6732 goto error;
6733 }
6734 insize++;
6735 }
6736 /* 4=maximum length of a UTF-8 sequence */
6737 while (insize <= 4 && (in + insize) <= endin);
6738
6739 if (outsize <= 0) {
6740 Py_ssize_t startinpos, endinpos, outpos;
6741
6742 startinpos = in - startin;
6743 endinpos = startinpos + 1;
6744 outpos = out - PyUnicode_AS_UNICODE(*v);
6745 if (unicode_decode_call_errorhandler(
6746 errors, &errorHandler,
6747 encoding, reason,
6748 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006749 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006750 {
6751 goto error;
6752 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006753 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006754 }
6755 else {
6756 in += insize;
6757 memcpy(out, buffer, outsize * sizeof(wchar_t));
6758 out += outsize;
6759 }
6760 }
6761
6762 /* write a NUL character at the end */
6763 *out = 0;
6764
6765 /* Extend unicode object */
6766 outsize = out - startout;
6767 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006768 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006769 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006770 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006771
6772error:
6773 Py_XDECREF(encoding_obj);
6774 Py_XDECREF(errorHandler);
6775 Py_XDECREF(exc);
6776 return ret;
6777}
6778
Victor Stinner3a50e702011-10-18 21:21:00 +02006779static PyObject *
6780decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006781 const char *s, Py_ssize_t size,
6782 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783{
Victor Stinner76a31a62011-11-04 00:05:13 +01006784 PyObject *v = NULL;
6785 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006786
Victor Stinner3a50e702011-10-18 21:21:00 +02006787 if (code_page < 0) {
6788 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6789 return NULL;
6790 }
6791
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006792 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006793 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794
Victor Stinner76a31a62011-11-04 00:05:13 +01006795 do
6796 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006797#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006798 if (size > INT_MAX) {
6799 chunk_size = INT_MAX;
6800 final = 0;
6801 done = 0;
6802 }
6803 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006805 {
6806 chunk_size = (int)size;
6807 final = (consumed == NULL);
6808 done = 1;
6809 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810
Victor Stinner76a31a62011-11-04 00:05:13 +01006811 /* Skip trailing lead-byte unless 'final' is set */
6812 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6813 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814
Victor Stinner76a31a62011-11-04 00:05:13 +01006815 if (chunk_size == 0 && done) {
6816 if (v != NULL)
6817 break;
6818 Py_INCREF(unicode_empty);
6819 return unicode_empty;
6820 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006821
Victor Stinner76a31a62011-11-04 00:05:13 +01006822
6823 converted = decode_code_page_strict(code_page, &v,
6824 s, chunk_size);
6825 if (converted == -2)
6826 converted = decode_code_page_errors(code_page, &v,
6827 s, chunk_size,
6828 errors);
6829 assert(converted != 0);
6830
6831 if (converted < 0) {
6832 Py_XDECREF(v);
6833 return NULL;
6834 }
6835
6836 if (consumed)
6837 *consumed += converted;
6838
6839 s += converted;
6840 size -= converted;
6841 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006842
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006843 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844}
6845
Alexander Belopolsky40018472011-02-26 01:02:56 +00006846PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006847PyUnicode_DecodeCodePageStateful(int code_page,
6848 const char *s,
6849 Py_ssize_t size,
6850 const char *errors,
6851 Py_ssize_t *consumed)
6852{
6853 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6854}
6855
6856PyObject *
6857PyUnicode_DecodeMBCSStateful(const char *s,
6858 Py_ssize_t size,
6859 const char *errors,
6860 Py_ssize_t *consumed)
6861{
6862 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6863}
6864
6865PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866PyUnicode_DecodeMBCS(const char *s,
6867 Py_ssize_t size,
6868 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006869{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6871}
6872
Victor Stinner3a50e702011-10-18 21:21:00 +02006873static DWORD
6874encode_code_page_flags(UINT code_page, const char *errors)
6875{
6876 if (code_page == CP_UTF8) {
6877 if (winver.dwMajorVersion >= 6)
6878 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6879 and later */
6880 return WC_ERR_INVALID_CHARS;
6881 else
6882 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6883 return 0;
6884 }
6885 else if (code_page == CP_UTF7) {
6886 /* CP_UTF7 only supports flags=0 */
6887 return 0;
6888 }
6889 else {
6890 if (errors != NULL && strcmp(errors, "replace") == 0)
6891 return 0;
6892 else
6893 return WC_NO_BEST_FIT_CHARS;
6894 }
6895}
6896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006898 * Encode a Unicode string to a Windows code page into a byte string in strict
6899 * mode.
6900 *
6901 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6902 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006904static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006905encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01006906 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02006907 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908{
Victor Stinner554f3f02010-06-16 23:33:54 +00006909 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02006910 BOOL *pusedDefaultChar = &usedDefaultChar;
6911 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006912 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01006913 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006914 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 const DWORD flags = encode_code_page_flags(code_page, NULL);
6916 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006917 /* Create a substring so that we can get the UTF-16 representation
6918 of just the slice under consideration. */
6919 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006920
Martin v. Löwis3d325192011-11-04 18:23:06 +01006921 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006922
Victor Stinner3a50e702011-10-18 21:21:00 +02006923 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00006924 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02006925 else
Victor Stinner554f3f02010-06-16 23:33:54 +00006926 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00006927
Victor Stinner2fc507f2011-11-04 20:06:39 +01006928 substring = PyUnicode_Substring(unicode, offset, offset+len);
6929 if (substring == NULL)
6930 return -1;
6931 p = PyUnicode_AsUnicodeAndSize(substring, &size);
6932 if (p == NULL) {
6933 Py_DECREF(substring);
6934 return -1;
6935 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01006936
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006937 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006938 outsize = WideCharToMultiByte(code_page, flags,
6939 p, size,
6940 NULL, 0,
6941 NULL, pusedDefaultChar);
6942 if (outsize <= 0)
6943 goto error;
6944 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01006945 if (pusedDefaultChar && *pusedDefaultChar) {
6946 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006947 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006948 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006949
Victor Stinner3a50e702011-10-18 21:21:00 +02006950 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006953 if (*outbytes == NULL) {
6954 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006955 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006956 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006957 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 }
6959 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006960 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006961 const Py_ssize_t n = PyBytes_Size(*outbytes);
6962 if (outsize > PY_SSIZE_T_MAX - n) {
6963 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01006964 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006966 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01006967 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
6968 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006969 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01006970 }
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006972 }
6973
6974 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 outsize = WideCharToMultiByte(code_page, flags,
6976 p, size,
6977 out, outsize,
6978 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01006979 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006980 if (outsize <= 0)
6981 goto error;
6982 if (pusedDefaultChar && *pusedDefaultChar)
6983 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006985
Victor Stinner3a50e702011-10-18 21:21:00 +02006986error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01006987 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6989 return -2;
6990 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006991 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006992}
6993
Victor Stinner3a50e702011-10-18 21:21:00 +02006994/*
6995 * Encode a Unicode string to a Windows code page into a byte string using a
6996 * error handler.
6997 *
6998 * Returns consumed characters if succeed, or raise a WindowsError and returns
6999 * -1 on other error.
7000 */
7001static int
7002encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007003 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007004 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007005{
Victor Stinner3a50e702011-10-18 21:21:00 +02007006 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007007 Py_ssize_t pos = unicode_offset;
7008 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007009 /* Ideally, we should get reason from FormatMessage. This is the Windows
7010 2000 English version of the message. */
7011 const char *reason = "invalid character";
7012 /* 4=maximum length of a UTF-8 sequence */
7013 char buffer[4];
7014 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7015 Py_ssize_t outsize;
7016 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 PyObject *errorHandler = NULL;
7018 PyObject *exc = NULL;
7019 PyObject *encoding_obj = NULL;
7020 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007021 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007022 PyObject *rep;
7023 int ret = -1;
7024
7025 assert(insize > 0);
7026
7027 encoding = code_page_name(code_page, &encoding_obj);
7028 if (encoding == NULL)
7029 return -1;
7030
7031 if (errors == NULL || strcmp(errors, "strict") == 0) {
7032 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7033 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007034 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007035 if (exc != NULL) {
7036 PyCodec_StrictErrors(exc);
7037 Py_DECREF(exc);
7038 }
7039 Py_XDECREF(encoding_obj);
7040 return -1;
7041 }
7042
7043 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7044 pusedDefaultChar = &usedDefaultChar;
7045 else
7046 pusedDefaultChar = NULL;
7047
7048 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7049 PyErr_NoMemory();
7050 goto error;
7051 }
7052 outsize = insize * Py_ARRAY_LENGTH(buffer);
7053
7054 if (*outbytes == NULL) {
7055 /* Create string object */
7056 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7057 if (*outbytes == NULL)
7058 goto error;
7059 out = PyBytes_AS_STRING(*outbytes);
7060 }
7061 else {
7062 /* Extend string object */
7063 Py_ssize_t n = PyBytes_Size(*outbytes);
7064 if (n > PY_SSIZE_T_MAX - outsize) {
7065 PyErr_NoMemory();
7066 goto error;
7067 }
7068 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7069 goto error;
7070 out = PyBytes_AS_STRING(*outbytes) + n;
7071 }
7072
7073 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007074 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007076 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7077 wchar_t chars[2];
7078 int charsize;
7079 if (ch < 0x10000) {
7080 chars[0] = (wchar_t)ch;
7081 charsize = 1;
7082 }
7083 else {
7084 ch -= 0x10000;
7085 chars[0] = 0xd800 + (ch >> 10);
7086 chars[1] = 0xdc00 + (ch & 0x3ff);
7087 charsize = 2;
7088 }
7089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007091 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 buffer, Py_ARRAY_LENGTH(buffer),
7093 NULL, pusedDefaultChar);
7094 if (outsize > 0) {
7095 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7096 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007097 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 memcpy(out, buffer, outsize);
7099 out += outsize;
7100 continue;
7101 }
7102 }
7103 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7104 PyErr_SetFromWindowsErr(0);
7105 goto error;
7106 }
7107
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 rep = unicode_encode_call_errorhandler(
7109 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007110 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007111 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 if (rep == NULL)
7113 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007114 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007115
7116 if (PyBytes_Check(rep)) {
7117 outsize = PyBytes_GET_SIZE(rep);
7118 if (outsize != 1) {
7119 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7120 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7121 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7122 Py_DECREF(rep);
7123 goto error;
7124 }
7125 out = PyBytes_AS_STRING(*outbytes) + offset;
7126 }
7127 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7128 out += outsize;
7129 }
7130 else {
7131 Py_ssize_t i;
7132 enum PyUnicode_Kind kind;
7133 void *data;
7134
Benjamin Petersonbac79492012-01-14 13:34:47 -05007135 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007136 Py_DECREF(rep);
7137 goto error;
7138 }
7139
7140 outsize = PyUnicode_GET_LENGTH(rep);
7141 if (outsize != 1) {
7142 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7143 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7144 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7145 Py_DECREF(rep);
7146 goto error;
7147 }
7148 out = PyBytes_AS_STRING(*outbytes) + offset;
7149 }
7150 kind = PyUnicode_KIND(rep);
7151 data = PyUnicode_DATA(rep);
7152 for (i=0; i < outsize; i++) {
7153 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7154 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007155 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007156 encoding, unicode,
7157 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007158 "unable to encode error handler result to ASCII");
7159 Py_DECREF(rep);
7160 goto error;
7161 }
7162 *out = (unsigned char)ch;
7163 out++;
7164 }
7165 }
7166 Py_DECREF(rep);
7167 }
7168 /* write a NUL byte */
7169 *out = 0;
7170 outsize = out - PyBytes_AS_STRING(*outbytes);
7171 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7172 if (_PyBytes_Resize(outbytes, outsize) < 0)
7173 goto error;
7174 ret = 0;
7175
7176error:
7177 Py_XDECREF(encoding_obj);
7178 Py_XDECREF(errorHandler);
7179 Py_XDECREF(exc);
7180 return ret;
7181}
7182
Victor Stinner3a50e702011-10-18 21:21:00 +02007183static PyObject *
7184encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007185 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007186 const char *errors)
7187{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007188 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007190 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007191 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007192
Benjamin Petersonbac79492012-01-14 13:34:47 -05007193 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 return NULL;
7195 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 if (code_page < 0) {
7198 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7199 return NULL;
7200 }
7201
Martin v. Löwis3d325192011-11-04 18:23:06 +01007202 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007203 return PyBytes_FromStringAndSize(NULL, 0);
7204
Victor Stinner7581cef2011-11-03 22:32:33 +01007205 offset = 0;
7206 do
7207 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007210 chunks. */
7211 if (len > INT_MAX/2) {
7212 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007213 done = 0;
7214 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007215 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007216#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007217 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007219 done = 1;
7220 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007221
Victor Stinner76a31a62011-11-04 00:05:13 +01007222 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007223 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007224 errors);
7225 if (ret == -2)
7226 ret = encode_code_page_errors(code_page, &outbytes,
7227 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007228 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007229 if (ret < 0) {
7230 Py_XDECREF(outbytes);
7231 return NULL;
7232 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233
Victor Stinner7581cef2011-11-03 22:32:33 +01007234 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007235 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007236 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007237
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 return outbytes;
7239}
7240
7241PyObject *
7242PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7243 Py_ssize_t size,
7244 const char *errors)
7245{
Victor Stinner7581cef2011-11-03 22:32:33 +01007246 PyObject *unicode, *res;
7247 unicode = PyUnicode_FromUnicode(p, size);
7248 if (unicode == NULL)
7249 return NULL;
7250 res = encode_code_page(CP_ACP, unicode, errors);
7251 Py_DECREF(unicode);
7252 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007253}
7254
7255PyObject *
7256PyUnicode_EncodeCodePage(int code_page,
7257 PyObject *unicode,
7258 const char *errors)
7259{
Victor Stinner7581cef2011-11-03 22:32:33 +01007260 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007261}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007262
Alexander Belopolsky40018472011-02-26 01:02:56 +00007263PyObject *
7264PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007265{
7266 if (!PyUnicode_Check(unicode)) {
7267 PyErr_BadArgument();
7268 return NULL;
7269 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007270 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007271}
7272
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007273#undef NEED_RETRY
7274
Victor Stinner99b95382011-07-04 14:23:54 +02007275#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007276
Guido van Rossumd57fd912000-03-10 22:53:23 +00007277/* --- Character Mapping Codec -------------------------------------------- */
7278
Alexander Belopolsky40018472011-02-26 01:02:56 +00007279PyObject *
7280PyUnicode_DecodeCharmap(const char *s,
7281 Py_ssize_t size,
7282 PyObject *mapping,
7283 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007284{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007285 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007286 Py_ssize_t startinpos;
7287 Py_ssize_t endinpos;
7288 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007289 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007290 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007291 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007292 PyObject *errorHandler = NULL;
7293 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007294
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295 /* Default to Latin-1 */
7296 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007298
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007299 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007300 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007302 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007303 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007304 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007306 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007307 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007308 enum PyUnicode_Kind mapkind;
7309 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007310 Py_UCS4 x;
7311
Benjamin Petersonbac79492012-01-14 13:34:47 -05007312 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007313 return NULL;
7314
7315 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007316 mapdata = PyUnicode_DATA(mapping);
7317 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007318 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007319 unsigned char ch;
7320 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7321 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7322 if (outkind == PyUnicode_1BYTE_KIND) {
7323 void *outdata = PyUnicode_DATA(v);
7324 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7325 while (s < e) {
7326 unsigned char ch = *s;
7327 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7328 if (x > maxchar)
7329 goto Error;
7330 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7331 ++s;
7332 }
7333 break;
7334 }
7335 else if (outkind == PyUnicode_2BYTE_KIND) {
7336 void *outdata = PyUnicode_DATA(v);
7337 while (s < e) {
7338 unsigned char ch = *s;
7339 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7340 if (x == 0xFFFE)
7341 goto Error;
7342 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7343 ++s;
7344 }
7345 break;
7346 }
7347 }
7348 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007351 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007352 else
7353 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007354Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007355 if (x == 0xfffe)
7356 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007357 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 startinpos = s-starts;
7359 endinpos = startinpos+1;
7360 if (unicode_decode_call_errorhandler(
7361 errors, &errorHandler,
7362 "charmap", "character maps to <undefined>",
7363 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007364 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007365 goto onError;
7366 }
7367 continue;
7368 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007369
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007370 if (unicode_putchar(&v, &outpos, x) < 0)
7371 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007372 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007373 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007374 }
7375 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 while (s < e) {
7377 unsigned char ch = *s;
7378 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007379
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7381 w = PyLong_FromLong((long)ch);
7382 if (w == NULL)
7383 goto onError;
7384 x = PyObject_GetItem(mapping, w);
7385 Py_DECREF(w);
7386 if (x == NULL) {
7387 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7388 /* No mapping found means: mapping is undefined. */
7389 PyErr_Clear();
7390 x = Py_None;
7391 Py_INCREF(x);
7392 } else
7393 goto onError;
7394 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007395
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 /* Apply mapping */
7397 if (PyLong_Check(x)) {
7398 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007399 if (value < 0 || value > MAX_UNICODE) {
7400 PyErr_Format(PyExc_TypeError,
7401 "character mapping must be in range(0x%lx)",
7402 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007403 Py_DECREF(x);
7404 goto onError;
7405 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007406 if (unicode_putchar(&v, &outpos, value) < 0)
7407 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 }
7409 else if (x == Py_None) {
7410 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007411 startinpos = s-starts;
7412 endinpos = startinpos+1;
7413 if (unicode_decode_call_errorhandler(
7414 errors, &errorHandler,
7415 "charmap", "character maps to <undefined>",
7416 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007417 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 Py_DECREF(x);
7419 goto onError;
7420 }
7421 Py_DECREF(x);
7422 continue;
7423 }
7424 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007425 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007426
Benjamin Petersonbac79492012-01-14 13:34:47 -05007427 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007428 goto onError;
7429 targetsize = PyUnicode_GET_LENGTH(x);
7430
7431 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007432 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007433 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007434 PyUnicode_READ_CHAR(x, 0)) < 0)
7435 goto onError;
7436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 else if (targetsize > 1) {
7438 /* 1-n mapping */
7439 if (targetsize > extrachars) {
7440 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 Py_ssize_t needed = (targetsize - extrachars) + \
7442 (targetsize << 2);
7443 extrachars += needed;
7444 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007445 if (unicode_resize(&v,
7446 PyUnicode_GET_LENGTH(v) + needed) < 0)
7447 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007448 Py_DECREF(x);
7449 goto onError;
7450 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007451 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007452 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007453 goto onError;
7454 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7455 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 extrachars -= targetsize;
7457 }
7458 /* 1-0 mapping: skip the character */
7459 }
7460 else {
7461 /* wrong return value */
7462 PyErr_SetString(PyExc_TypeError,
7463 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007464 Py_DECREF(x);
7465 goto onError;
7466 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 Py_DECREF(x);
7468 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007469 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007470 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007471 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007472 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007473 Py_XDECREF(errorHandler);
7474 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007475 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007476
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007478 Py_XDECREF(errorHandler);
7479 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480 Py_XDECREF(v);
7481 return NULL;
7482}
7483
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007484/* Charmap encoding: the lookup table */
7485
Alexander Belopolsky40018472011-02-26 01:02:56 +00007486struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007487 PyObject_HEAD
7488 unsigned char level1[32];
7489 int count2, count3;
7490 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007491};
7492
7493static PyObject*
7494encoding_map_size(PyObject *obj, PyObject* args)
7495{
7496 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007497 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007499}
7500
7501static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007502 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007503 PyDoc_STR("Return the size (in bytes) of this object") },
7504 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007505};
7506
7507static void
7508encoding_map_dealloc(PyObject* o)
7509{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007510 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007511}
7512
7513static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007514 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 "EncodingMap", /*tp_name*/
7516 sizeof(struct encoding_map), /*tp_basicsize*/
7517 0, /*tp_itemsize*/
7518 /* methods */
7519 encoding_map_dealloc, /*tp_dealloc*/
7520 0, /*tp_print*/
7521 0, /*tp_getattr*/
7522 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007523 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 0, /*tp_repr*/
7525 0, /*tp_as_number*/
7526 0, /*tp_as_sequence*/
7527 0, /*tp_as_mapping*/
7528 0, /*tp_hash*/
7529 0, /*tp_call*/
7530 0, /*tp_str*/
7531 0, /*tp_getattro*/
7532 0, /*tp_setattro*/
7533 0, /*tp_as_buffer*/
7534 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7535 0, /*tp_doc*/
7536 0, /*tp_traverse*/
7537 0, /*tp_clear*/
7538 0, /*tp_richcompare*/
7539 0, /*tp_weaklistoffset*/
7540 0, /*tp_iter*/
7541 0, /*tp_iternext*/
7542 encoding_map_methods, /*tp_methods*/
7543 0, /*tp_members*/
7544 0, /*tp_getset*/
7545 0, /*tp_base*/
7546 0, /*tp_dict*/
7547 0, /*tp_descr_get*/
7548 0, /*tp_descr_set*/
7549 0, /*tp_dictoffset*/
7550 0, /*tp_init*/
7551 0, /*tp_alloc*/
7552 0, /*tp_new*/
7553 0, /*tp_free*/
7554 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007555};
7556
7557PyObject*
7558PyUnicode_BuildEncodingMap(PyObject* string)
7559{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007560 PyObject *result;
7561 struct encoding_map *mresult;
7562 int i;
7563 int need_dict = 0;
7564 unsigned char level1[32];
7565 unsigned char level2[512];
7566 unsigned char *mlevel1, *mlevel2, *mlevel3;
7567 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007568 int kind;
7569 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007570 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007571 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007572
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007573 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007574 PyErr_BadArgument();
7575 return NULL;
7576 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007577 kind = PyUnicode_KIND(string);
7578 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007579 length = PyUnicode_GET_LENGTH(string);
7580 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007581 memset(level1, 0xFF, sizeof level1);
7582 memset(level2, 0xFF, sizeof level2);
7583
7584 /* If there isn't a one-to-one mapping of NULL to \0,
7585 or if there are non-BMP characters, we need to use
7586 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007587 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007588 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007589 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007590 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007591 ch = PyUnicode_READ(kind, data, i);
7592 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007593 need_dict = 1;
7594 break;
7595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007596 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007597 /* unmapped character */
7598 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007599 l1 = ch >> 11;
7600 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007601 if (level1[l1] == 0xFF)
7602 level1[l1] = count2++;
7603 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007604 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007605 }
7606
7607 if (count2 >= 0xFF || count3 >= 0xFF)
7608 need_dict = 1;
7609
7610 if (need_dict) {
7611 PyObject *result = PyDict_New();
7612 PyObject *key, *value;
7613 if (!result)
7614 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007615 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007616 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007617 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007618 if (!key || !value)
7619 goto failed1;
7620 if (PyDict_SetItem(result, key, value) == -1)
7621 goto failed1;
7622 Py_DECREF(key);
7623 Py_DECREF(value);
7624 }
7625 return result;
7626 failed1:
7627 Py_XDECREF(key);
7628 Py_XDECREF(value);
7629 Py_DECREF(result);
7630 return NULL;
7631 }
7632
7633 /* Create a three-level trie */
7634 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7635 16*count2 + 128*count3 - 1);
7636 if (!result)
7637 return PyErr_NoMemory();
7638 PyObject_Init(result, &EncodingMapType);
7639 mresult = (struct encoding_map*)result;
7640 mresult->count2 = count2;
7641 mresult->count3 = count3;
7642 mlevel1 = mresult->level1;
7643 mlevel2 = mresult->level23;
7644 mlevel3 = mresult->level23 + 16*count2;
7645 memcpy(mlevel1, level1, 32);
7646 memset(mlevel2, 0xFF, 16*count2);
7647 memset(mlevel3, 0, 128*count3);
7648 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007649 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007650 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007651 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7652 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007653 /* unmapped character */
7654 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007655 o1 = ch>>11;
7656 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007657 i2 = 16*mlevel1[o1] + o2;
7658 if (mlevel2[i2] == 0xFF)
7659 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007660 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007661 i3 = 128*mlevel2[i2] + o3;
7662 mlevel3[i3] = i;
7663 }
7664 return result;
7665}
7666
7667static int
Victor Stinner22168992011-11-20 17:09:18 +01007668encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007669{
7670 struct encoding_map *map = (struct encoding_map*)mapping;
7671 int l1 = c>>11;
7672 int l2 = (c>>7) & 0xF;
7673 int l3 = c & 0x7F;
7674 int i;
7675
Victor Stinner22168992011-11-20 17:09:18 +01007676 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007677 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007678 if (c == 0)
7679 return 0;
7680 /* level 1*/
7681 i = map->level1[l1];
7682 if (i == 0xFF) {
7683 return -1;
7684 }
7685 /* level 2*/
7686 i = map->level23[16*i+l2];
7687 if (i == 0xFF) {
7688 return -1;
7689 }
7690 /* level 3 */
7691 i = map->level23[16*map->count2 + 128*i + l3];
7692 if (i == 0) {
7693 return -1;
7694 }
7695 return i;
7696}
7697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698/* Lookup the character ch in the mapping. If the character
7699 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007700 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007701static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007702charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
Christian Heimes217cfd12007-12-02 14:31:20 +00007704 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 PyObject *x;
7706
7707 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 x = PyObject_GetItem(mapping, w);
7710 Py_DECREF(w);
7711 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007712 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7713 /* No mapping found means: mapping is undefined. */
7714 PyErr_Clear();
7715 x = Py_None;
7716 Py_INCREF(x);
7717 return x;
7718 } else
7719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007721 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007722 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007723 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007724 long value = PyLong_AS_LONG(x);
7725 if (value < 0 || value > 255) {
7726 PyErr_SetString(PyExc_TypeError,
7727 "character mapping must be in range(256)");
7728 Py_DECREF(x);
7729 return NULL;
7730 }
7731 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007732 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007733 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007735 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 /* wrong return value */
7737 PyErr_Format(PyExc_TypeError,
7738 "character mapping must return integer, bytes or None, not %.400s",
7739 x->ob_type->tp_name);
7740 Py_DECREF(x);
7741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007742 }
7743}
7744
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007745static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007746charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007748 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7749 /* exponentially overallocate to minimize reallocations */
7750 if (requiredsize < 2*outsize)
7751 requiredsize = 2*outsize;
7752 if (_PyBytes_Resize(outobj, requiredsize))
7753 return -1;
7754 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007755}
7756
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* a Python exception was raised */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007760/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007761 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007762 space is available. Return a new reference to the object that
7763 was put in the output buffer, or Py_None, if the mapping was undefined
7764 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007765 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007766static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007767charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007768 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007769{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007770 PyObject *rep;
7771 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007772 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007773
Christian Heimes90aa7642007-12-19 02:45:37 +00007774 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007776 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 if (res == -1)
7778 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 if (outsize<requiredsize)
7780 if (charmapencode_resize(outobj, outpos, requiredsize))
7781 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007782 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007783 outstart[(*outpos)++] = (char)res;
7784 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785 }
7786
7787 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007788 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007790 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 Py_DECREF(rep);
7792 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007793 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 if (PyLong_Check(rep)) {
7795 Py_ssize_t requiredsize = *outpos+1;
7796 if (outsize<requiredsize)
7797 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7798 Py_DECREF(rep);
7799 return enc_EXCEPTION;
7800 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007801 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007802 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007803 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007804 else {
7805 const char *repchars = PyBytes_AS_STRING(rep);
7806 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7807 Py_ssize_t requiredsize = *outpos+repsize;
7808 if (outsize<requiredsize)
7809 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7810 Py_DECREF(rep);
7811 return enc_EXCEPTION;
7812 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007813 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 memcpy(outstart + *outpos, repchars, repsize);
7815 *outpos += repsize;
7816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007818 Py_DECREF(rep);
7819 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820}
7821
7822/* handle an error in PyUnicode_EncodeCharmap
7823 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007824static int
7825charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007826 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007828 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007829 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830{
7831 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007832 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007833 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007834 enum PyUnicode_Kind kind;
7835 void *data;
7836 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007837 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007838 Py_ssize_t collstartpos = *inpos;
7839 Py_ssize_t collendpos = *inpos+1;
7840 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 char *encoding = "charmap";
7842 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007844 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007845 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007846
Benjamin Petersonbac79492012-01-14 13:34:47 -05007847 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007848 return -1;
7849 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007850 /* find all unencodable characters */
7851 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007853 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007854 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007855 val = encoding_map_lookup(ch, mapping);
7856 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 break;
7858 ++collendpos;
7859 continue;
7860 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007861
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007862 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7863 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 if (rep==NULL)
7865 return -1;
7866 else if (rep!=Py_None) {
7867 Py_DECREF(rep);
7868 break;
7869 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007870 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007872 }
7873 /* cache callback name lookup
7874 * (if not done yet, i.e. it's the first error) */
7875 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007876 if ((errors==NULL) || (!strcmp(errors, "strict")))
7877 *known_errorHandler = 1;
7878 else if (!strcmp(errors, "replace"))
7879 *known_errorHandler = 2;
7880 else if (!strcmp(errors, "ignore"))
7881 *known_errorHandler = 3;
7882 else if (!strcmp(errors, "xmlcharrefreplace"))
7883 *known_errorHandler = 4;
7884 else
7885 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007886 }
7887 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007888 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007889 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007890 return -1;
7891 case 2: /* replace */
7892 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 x = charmapencode_output('?', mapping, res, respos);
7894 if (x==enc_EXCEPTION) {
7895 return -1;
7896 }
7897 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007898 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 return -1;
7900 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007901 }
7902 /* fall through */
7903 case 3: /* ignore */
7904 *inpos = collendpos;
7905 break;
7906 case 4: /* xmlcharrefreplace */
7907 /* generate replacement (temporarily (mis)uses p) */
7908 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 char buffer[2+29+1+1];
7910 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007911 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 for (cp = buffer; *cp; ++cp) {
7913 x = charmapencode_output(*cp, mapping, res, respos);
7914 if (x==enc_EXCEPTION)
7915 return -1;
7916 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007917 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 return -1;
7919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007920 }
7921 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007922 *inpos = collendpos;
7923 break;
7924 default:
7925 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007926 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00007927 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007928 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007930 if (PyBytes_Check(repunicode)) {
7931 /* Directly copy bytes result to output. */
7932 Py_ssize_t outsize = PyBytes_Size(*res);
7933 Py_ssize_t requiredsize;
7934 repsize = PyBytes_Size(repunicode);
7935 requiredsize = *respos + repsize;
7936 if (requiredsize > outsize)
7937 /* Make room for all additional bytes. */
7938 if (charmapencode_resize(res, respos, requiredsize)) {
7939 Py_DECREF(repunicode);
7940 return -1;
7941 }
7942 memcpy(PyBytes_AsString(*res) + *respos,
7943 PyBytes_AsString(repunicode), repsize);
7944 *respos += repsize;
7945 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007946 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007947 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007948 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007949 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05007950 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007951 Py_DECREF(repunicode);
7952 return -1;
7953 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01007954 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007955 data = PyUnicode_DATA(repunicode);
7956 kind = PyUnicode_KIND(repunicode);
7957 for (index = 0; index < repsize; index++) {
7958 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
7959 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007960 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007961 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00007962 return -1;
7963 }
7964 else if (x==enc_FAILED) {
7965 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007966 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 return -1;
7968 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007969 }
7970 *inpos = newpos;
7971 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007972 }
7973 return 0;
7974}
7975
Alexander Belopolsky40018472011-02-26 01:02:56 +00007976PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007977_PyUnicode_EncodeCharmap(PyObject *unicode,
7978 PyObject *mapping,
7979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007981 /* output object */
7982 PyObject *res = NULL;
7983 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007984 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007985 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007987 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 PyObject *errorHandler = NULL;
7989 PyObject *exc = NULL;
7990 /* the following variable is used for caching string comparisons
7991 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7992 * 3=ignore, 4=xmlcharrefreplace */
7993 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994
Benjamin Petersonbac79492012-01-14 13:34:47 -05007995 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007996 return NULL;
7997 size = PyUnicode_GET_LENGTH(unicode);
7998
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999 /* Default to Latin-1 */
8000 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008001 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008002
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008003 /* allocate enough for a simple encoding without
8004 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008005 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008006 if (res == NULL)
8007 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008008 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008012 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008014 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 if (x==enc_EXCEPTION) /* error */
8016 goto onError;
8017 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008018 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 &exc,
8020 &known_errorHandler, &errorHandler, errors,
8021 &res, &respos)) {
8022 goto onError;
8023 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008024 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 else
8026 /* done with this character => adjust input position */
8027 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008028 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008031 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008032 if (_PyBytes_Resize(&res, respos) < 0)
8033 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008034
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008035 Py_XDECREF(exc);
8036 Py_XDECREF(errorHandler);
8037 return res;
8038
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040 Py_XDECREF(res);
8041 Py_XDECREF(exc);
8042 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008043 return NULL;
8044}
8045
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008046/* Deprecated */
8047PyObject *
8048PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8049 Py_ssize_t size,
8050 PyObject *mapping,
8051 const char *errors)
8052{
8053 PyObject *result;
8054 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8055 if (unicode == NULL)
8056 return NULL;
8057 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8058 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008059 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008060}
8061
Alexander Belopolsky40018472011-02-26 01:02:56 +00008062PyObject *
8063PyUnicode_AsCharmapString(PyObject *unicode,
8064 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065{
8066 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 PyErr_BadArgument();
8068 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008069 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008070 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071}
8072
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008074static void
8075make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008076 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008077 Py_ssize_t startpos, Py_ssize_t endpos,
8078 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081 *exceptionObject = _PyUnicodeTranslateError_Create(
8082 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008083 }
8084 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8086 goto onError;
8087 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8088 goto onError;
8089 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8090 goto onError;
8091 return;
8092 onError:
8093 Py_DECREF(*exceptionObject);
8094 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008095 }
8096}
8097
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008099static void
8100raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008101 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008102 Py_ssize_t startpos, Py_ssize_t endpos,
8103 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104{
8105 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008107 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109}
8110
8111/* error handling callback helper:
8112 build arguments, call the callback and check the arguments,
8113 put the result into newpos and return the replacement string, which
8114 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008115static PyObject *
8116unicode_translate_call_errorhandler(const char *errors,
8117 PyObject **errorHandler,
8118 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008120 Py_ssize_t startpos, Py_ssize_t endpos,
8121 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008123 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008125 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008126 PyObject *restuple;
8127 PyObject *resunicode;
8128
8129 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008130 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 }
8134
8135 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008136 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008139
8140 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008144 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008145 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 Py_DECREF(restuple);
8147 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 }
8149 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 &resunicode, &i_newpos)) {
8151 Py_DECREF(restuple);
8152 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008154 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008155 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008156 else
8157 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008158 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8160 Py_DECREF(restuple);
8161 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008162 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008163 Py_INCREF(resunicode);
8164 Py_DECREF(restuple);
8165 return resunicode;
8166}
8167
8168/* Lookup the character ch in the mapping and put the result in result,
8169 which must be decrefed by the caller.
8170 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008171static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008172charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173{
Christian Heimes217cfd12007-12-02 14:31:20 +00008174 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008175 PyObject *x;
8176
8177 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008179 x = PyObject_GetItem(mapping, w);
8180 Py_DECREF(w);
8181 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8183 /* No mapping found means: use 1:1 mapping. */
8184 PyErr_Clear();
8185 *result = NULL;
8186 return 0;
8187 } else
8188 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 }
8190 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 *result = x;
8192 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008193 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008194 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 long value = PyLong_AS_LONG(x);
8196 long max = PyUnicode_GetMax();
8197 if (value < 0 || value > max) {
8198 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008199 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 Py_DECREF(x);
8201 return -1;
8202 }
8203 *result = x;
8204 return 0;
8205 }
8206 else if (PyUnicode_Check(x)) {
8207 *result = x;
8208 return 0;
8209 }
8210 else {
8211 /* wrong return value */
8212 PyErr_SetString(PyExc_TypeError,
8213 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 Py_DECREF(x);
8215 return -1;
8216 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008217}
8218/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 if not reallocate and adjust various state variables.
8220 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008221static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008224{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008226 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008227 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 /* exponentially overallocate to minimize reallocations */
8229 if (requiredsize < 2 * oldsize)
8230 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008231 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8232 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008233 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008234 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236 }
8237 return 0;
8238}
8239/* lookup the character, put the result in the output string and adjust
8240 various state variables. Return a new reference to the object that
8241 was put in the output buffer in *result, or Py_None, if the mapping was
8242 undefined (in which case no character was written).
8243 The called must decref result.
8244 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008245static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008246charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8247 PyObject *mapping, Py_UCS4 **output,
8248 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008249 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8252 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008256 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257 }
8258 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008260 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008262 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 }
8264 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 Py_ssize_t repsize;
8266 if (PyUnicode_READY(*res) == -1)
8267 return -1;
8268 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008269 if (repsize==1) {
8270 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 }
8273 else if (repsize!=0) {
8274 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 Py_ssize_t requiredsize = *opos +
8276 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 Py_ssize_t i;
8279 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 for(i = 0; i < repsize; i++)
8282 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008283 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 }
8285 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008286 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287 return 0;
8288}
8289
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008291_PyUnicode_TranslateCharmap(PyObject *input,
8292 PyObject *mapping,
8293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 /* input object */
8296 char *idata;
8297 Py_ssize_t size, i;
8298 int kind;
8299 /* output buffer */
8300 Py_UCS4 *output = NULL;
8301 Py_ssize_t osize;
8302 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 char *reason = "character maps to <undefined>";
8306 PyObject *errorHandler = NULL;
8307 PyObject *exc = NULL;
8308 /* the following variable is used for caching string comparisons
8309 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8310 * 3=ignore, 4=xmlcharrefreplace */
8311 int known_errorHandler = -1;
8312
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 PyErr_BadArgument();
8315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008318 if (PyUnicode_READY(input) == -1)
8319 return NULL;
8320 idata = (char*)PyUnicode_DATA(input);
8321 kind = PyUnicode_KIND(input);
8322 size = PyUnicode_GET_LENGTH(input);
8323 i = 0;
8324
8325 if (size == 0) {
8326 Py_INCREF(input);
8327 return input;
8328 }
8329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 /* allocate enough for a simple 1:1 translation without
8331 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008332 osize = size;
8333 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8334 opos = 0;
8335 if (output == NULL) {
8336 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 /* try to encode it */
8342 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 if (charmaptranslate_output(input, i, mapping,
8344 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 Py_XDECREF(x);
8346 goto onError;
8347 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008348 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008350 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 else { /* untranslatable character */
8352 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8353 Py_ssize_t repsize;
8354 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008355 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008356 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008357 Py_ssize_t collstart = i;
8358 Py_ssize_t collend = i+1;
8359 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 while (collend < size) {
8363 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 goto onError;
8365 Py_XDECREF(x);
8366 if (x!=Py_None)
8367 break;
8368 ++collend;
8369 }
8370 /* cache callback name lookup
8371 * (if not done yet, i.e. it's the first error) */
8372 if (known_errorHandler==-1) {
8373 if ((errors==NULL) || (!strcmp(errors, "strict")))
8374 known_errorHandler = 1;
8375 else if (!strcmp(errors, "replace"))
8376 known_errorHandler = 2;
8377 else if (!strcmp(errors, "ignore"))
8378 known_errorHandler = 3;
8379 else if (!strcmp(errors, "xmlcharrefreplace"))
8380 known_errorHandler = 4;
8381 else
8382 known_errorHandler = 0;
8383 }
8384 switch (known_errorHandler) {
8385 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008386 raise_translate_exception(&exc, input, collstart,
8387 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008388 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 case 2: /* replace */
8390 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 for (coll = collstart; coll<collend; coll++)
8392 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 /* fall through */
8394 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008395 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 break;
8397 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 /* generate replacement (temporarily (mis)uses i) */
8399 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 char buffer[2+29+1+1];
8401 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8403 if (charmaptranslate_makespace(&output, &osize,
8404 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 goto onError;
8406 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 break;
8411 default:
8412 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 reason, input, &exc,
8414 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008415 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008417 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008418 Py_DECREF(repunicode);
8419 goto onError;
8420 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 repsize = PyUnicode_GET_LENGTH(repunicode);
8423 if (charmaptranslate_makespace(&output, &osize,
8424 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 Py_DECREF(repunicode);
8426 goto onError;
8427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008428 for (uni2 = 0; repsize-->0; ++uni2)
8429 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8430 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 }
8434 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008435 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8436 if (!res)
8437 goto onError;
8438 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 Py_XDECREF(exc);
8440 Py_XDECREF(errorHandler);
8441 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008442
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 Py_XDECREF(exc);
8446 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008447 return NULL;
8448}
8449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450/* Deprecated. Use PyUnicode_Translate instead. */
8451PyObject *
8452PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8453 Py_ssize_t size,
8454 PyObject *mapping,
8455 const char *errors)
8456{
Christian Heimes5f520f42012-09-11 14:03:25 +02008457 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8459 if (!unicode)
8460 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008461 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8462 Py_DECREF(unicode);
8463 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008464}
8465
Alexander Belopolsky40018472011-02-26 01:02:56 +00008466PyObject *
8467PyUnicode_Translate(PyObject *str,
8468 PyObject *mapping,
8469 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008470{
8471 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008472
Guido van Rossumd57fd912000-03-10 22:53:23 +00008473 str = PyUnicode_FromObject(str);
8474 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008475 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008477 Py_DECREF(str);
8478 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008479}
Tim Petersced69f82003-09-16 20:30:58 +00008480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008482fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008483{
8484 /* No need to call PyUnicode_READY(self) because this function is only
8485 called as a callback from fixup() which does it already. */
8486 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8487 const int kind = PyUnicode_KIND(self);
8488 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008489 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008490 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 Py_ssize_t i;
8492
8493 for (i = 0; i < len; ++i) {
8494 ch = PyUnicode_READ(kind, data, i);
8495 fixed = 0;
8496 if (ch > 127) {
8497 if (Py_UNICODE_ISSPACE(ch))
8498 fixed = ' ';
8499 else {
8500 const int decimal = Py_UNICODE_TODECIMAL(ch);
8501 if (decimal >= 0)
8502 fixed = '0' + decimal;
8503 }
8504 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008505 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008506 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 PyUnicode_WRITE(kind, data, i, fixed);
8508 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008509 else
8510 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008512 }
8513
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008514 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515}
8516
8517PyObject *
8518_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8519{
8520 if (!PyUnicode_Check(unicode)) {
8521 PyErr_BadInternalCall();
8522 return NULL;
8523 }
8524 if (PyUnicode_READY(unicode) == -1)
8525 return NULL;
8526 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8527 /* If the string is already ASCII, just return the same string */
8528 Py_INCREF(unicode);
8529 return unicode;
8530 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008531 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008532}
8533
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008534PyObject *
8535PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8536 Py_ssize_t length)
8537{
Victor Stinnerf0124502011-11-21 23:12:56 +01008538 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008539 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008540 Py_UCS4 maxchar;
8541 enum PyUnicode_Kind kind;
8542 void *data;
8543
Victor Stinner99d7ad02012-02-22 13:37:39 +01008544 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008545 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008546 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008547 if (ch > 127) {
8548 int decimal = Py_UNICODE_TODECIMAL(ch);
8549 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008550 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008551 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008552 }
8553 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008554
8555 /* Copy to a new string */
8556 decimal = PyUnicode_New(length, maxchar);
8557 if (decimal == NULL)
8558 return decimal;
8559 kind = PyUnicode_KIND(decimal);
8560 data = PyUnicode_DATA(decimal);
8561 /* Iterate over code points */
8562 for (i = 0; i < length; i++) {
8563 Py_UNICODE ch = s[i];
8564 if (ch > 127) {
8565 int decimal = Py_UNICODE_TODECIMAL(ch);
8566 if (decimal >= 0)
8567 ch = '0' + decimal;
8568 }
8569 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008571 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008572}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008573/* --- Decimal Encoder ---------------------------------------------------- */
8574
Alexander Belopolsky40018472011-02-26 01:02:56 +00008575int
8576PyUnicode_EncodeDecimal(Py_UNICODE *s,
8577 Py_ssize_t length,
8578 char *output,
8579 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008580{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008581 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008582 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008583 enum PyUnicode_Kind kind;
8584 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008585
8586 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008587 PyErr_BadArgument();
8588 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008589 }
8590
Victor Stinner42bf7752011-11-21 22:52:58 +01008591 unicode = PyUnicode_FromUnicode(s, length);
8592 if (unicode == NULL)
8593 return -1;
8594
Benjamin Petersonbac79492012-01-14 13:34:47 -05008595 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008596 Py_DECREF(unicode);
8597 return -1;
8598 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008599 kind = PyUnicode_KIND(unicode);
8600 data = PyUnicode_DATA(unicode);
8601
Victor Stinnerb84d7232011-11-22 01:50:07 +01008602 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008603 PyObject *exc;
8604 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008606 Py_ssize_t startpos;
8607
8608 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008609
Benjamin Peterson29060642009-01-31 22:14:21 +00008610 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008611 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008612 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008614 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 decimal = Py_UNICODE_TODECIMAL(ch);
8616 if (decimal >= 0) {
8617 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008618 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 continue;
8620 }
8621 if (0 < ch && ch < 256) {
8622 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008623 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008624 continue;
8625 }
Victor Stinner6345be92011-11-25 20:09:01 +01008626
Victor Stinner42bf7752011-11-21 22:52:58 +01008627 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008628 exc = NULL;
8629 raise_encode_exception(&exc, "decimal", unicode,
8630 startpos, startpos+1,
8631 "invalid decimal Unicode string");
8632 Py_XDECREF(exc);
8633 Py_DECREF(unicode);
8634 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008635 }
8636 /* 0-terminate the output string */
8637 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008638 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008639 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008640}
8641
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642/* --- Helpers ------------------------------------------------------------ */
8643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008645any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008646 Py_ssize_t start,
8647 Py_ssize_t end)
8648{
8649 int kind1, kind2, kind;
8650 void *buf1, *buf2;
8651 Py_ssize_t len1, len2, result;
8652
8653 kind1 = PyUnicode_KIND(s1);
8654 kind2 = PyUnicode_KIND(s2);
8655 kind = kind1 > kind2 ? kind1 : kind2;
8656 buf1 = PyUnicode_DATA(s1);
8657 buf2 = PyUnicode_DATA(s2);
8658 if (kind1 != kind)
8659 buf1 = _PyUnicode_AsKind(s1, kind);
8660 if (!buf1)
8661 return -2;
8662 if (kind2 != kind)
8663 buf2 = _PyUnicode_AsKind(s2, kind);
8664 if (!buf2) {
8665 if (kind1 != kind) PyMem_Free(buf1);
8666 return -2;
8667 }
8668 len1 = PyUnicode_GET_LENGTH(s1);
8669 len2 = PyUnicode_GET_LENGTH(s2);
8670
Victor Stinner794d5672011-10-10 03:21:36 +02008671 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008672 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008673 case PyUnicode_1BYTE_KIND:
8674 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8675 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8676 else
8677 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8678 break;
8679 case PyUnicode_2BYTE_KIND:
8680 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8681 break;
8682 case PyUnicode_4BYTE_KIND:
8683 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8684 break;
8685 default:
8686 assert(0); result = -2;
8687 }
8688 }
8689 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008690 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008691 case PyUnicode_1BYTE_KIND:
8692 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8693 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8694 else
8695 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8696 break;
8697 case PyUnicode_2BYTE_KIND:
8698 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8699 break;
8700 case PyUnicode_4BYTE_KIND:
8701 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8702 break;
8703 default:
8704 assert(0); result = -2;
8705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 }
8707
8708 if (kind1 != kind)
8709 PyMem_Free(buf1);
8710 if (kind2 != kind)
8711 PyMem_Free(buf2);
8712
8713 return result;
8714}
8715
8716Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008717_PyUnicode_InsertThousandsGrouping(
8718 PyObject *unicode, Py_ssize_t index,
8719 Py_ssize_t n_buffer,
8720 void *digits, Py_ssize_t n_digits,
8721 Py_ssize_t min_width,
8722 const char *grouping, PyObject *thousands_sep,
8723 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008724{
Victor Stinner41a863c2012-02-24 00:37:51 +01008725 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008726 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008727 Py_ssize_t thousands_sep_len;
8728 Py_ssize_t len;
8729
8730 if (unicode != NULL) {
8731 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008732 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008733 }
8734 else {
8735 kind = PyUnicode_1BYTE_KIND;
8736 data = NULL;
8737 }
8738 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8739 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8740 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8741 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008742 if (thousands_sep_kind < kind) {
8743 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8744 if (!thousands_sep_data)
8745 return -1;
8746 }
8747 else {
8748 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8749 if (!data)
8750 return -1;
8751 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008752 }
8753
Benjamin Petersonead6b532011-12-20 17:23:42 -06008754 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008756 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008757 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008758 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008759 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008760 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008761 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008762 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008763 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008764 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008765 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008766 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008768 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008769 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008770 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008771 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008772 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008774 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008775 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008776 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008777 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008778 break;
8779 default:
8780 assert(0);
8781 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008783 if (unicode != NULL && thousands_sep_kind != kind) {
8784 if (thousands_sep_kind < kind)
8785 PyMem_Free(thousands_sep_data);
8786 else
8787 PyMem_Free(data);
8788 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008789 if (unicode == NULL) {
8790 *maxchar = 127;
8791 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008792 *maxchar = MAX_MAXCHAR(*maxchar,
8793 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008794 }
8795 }
8796 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008797}
8798
8799
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len`; a negative `end` or `start` has `len` added
   and is then clamped to 0.  `start` is not clamped to `len`.

   `start` and `end` must be modifiable lvalues; every argument may be
   evaluated more than once, so pass expressions without side effects.
   The body is wrapped in do { } while (0) so that the macro acts as a
   single statement (safe inside an unbraced if/else); terminate each use
   with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008814
Alexander Belopolsky40018472011-02-26 01:02:56 +00008815Py_ssize_t
8816PyUnicode_Count(PyObject *str,
8817 PyObject *substr,
8818 Py_ssize_t start,
8819 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008821 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008822 PyObject* str_obj;
8823 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 int kind1, kind2, kind;
8825 void *buf1 = NULL, *buf2 = NULL;
8826 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008827
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008828 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008829 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008831 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008832 if (!sub_obj) {
8833 Py_DECREF(str_obj);
8834 return -1;
8835 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008836 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008837 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008838 Py_DECREF(str_obj);
8839 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008840 }
Tim Petersced69f82003-09-16 20:30:58 +00008841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 kind1 = PyUnicode_KIND(str_obj);
8843 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008844 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008847 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008848 if (kind2 > kind) {
8849 Py_DECREF(sub_obj);
8850 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008851 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008852 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008853 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855 if (!buf2)
8856 goto onError;
8857 len1 = PyUnicode_GET_LENGTH(str_obj);
8858 len2 = PyUnicode_GET_LENGTH(sub_obj);
8859
8860 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008861 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008863 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8864 result = asciilib_count(
8865 ((Py_UCS1*)buf1) + start, end - start,
8866 buf2, len2, PY_SSIZE_T_MAX
8867 );
8868 else
8869 result = ucs1lib_count(
8870 ((Py_UCS1*)buf1) + start, end - start,
8871 buf2, len2, PY_SSIZE_T_MAX
8872 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 break;
8874 case PyUnicode_2BYTE_KIND:
8875 result = ucs2lib_count(
8876 ((Py_UCS2*)buf1) + start, end - start,
8877 buf2, len2, PY_SSIZE_T_MAX
8878 );
8879 break;
8880 case PyUnicode_4BYTE_KIND:
8881 result = ucs4lib_count(
8882 ((Py_UCS4*)buf1) + start, end - start,
8883 buf2, len2, PY_SSIZE_T_MAX
8884 );
8885 break;
8886 default:
8887 assert(0); result = 0;
8888 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008889
8890 Py_DECREF(sub_obj);
8891 Py_DECREF(str_obj);
8892
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008893 if (kind2 != kind)
8894 PyMem_Free(buf2);
8895
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 onError:
8898 Py_DECREF(sub_obj);
8899 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 if (kind2 != kind && buf2)
8901 PyMem_Free(buf2);
8902 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008903}
8904
Alexander Belopolsky40018472011-02-26 01:02:56 +00008905Py_ssize_t
8906PyUnicode_Find(PyObject *str,
8907 PyObject *sub,
8908 Py_ssize_t start,
8909 Py_ssize_t end,
8910 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008912 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008913
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008915 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008917 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008918 if (!sub) {
8919 Py_DECREF(str);
8920 return -2;
8921 }
8922 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
8923 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 Py_DECREF(str);
8925 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926 }
Tim Petersced69f82003-09-16 20:30:58 +00008927
Victor Stinner794d5672011-10-10 03:21:36 +02008928 result = any_find_slice(direction,
8929 str, sub, start, end
8930 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008931
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008933 Py_DECREF(sub);
8934
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935 return result;
8936}
8937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938Py_ssize_t
8939PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8940 Py_ssize_t start, Py_ssize_t end,
8941 int direction)
8942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008944 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008945 if (PyUnicode_READY(str) == -1)
8946 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008947 if (start < 0 || end < 0) {
8948 PyErr_SetString(PyExc_IndexError, "string index out of range");
8949 return -2;
8950 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008951 if (end > PyUnicode_GET_LENGTH(str))
8952 end = PyUnicode_GET_LENGTH(str);
8953 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008954 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8955 kind, end-start, ch, direction);
8956 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008958 else
8959 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960}
8961
Alexander Belopolsky40018472011-02-26 01:02:56 +00008962static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008963tailmatch(PyObject *self,
8964 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008965 Py_ssize_t start,
8966 Py_ssize_t end,
8967 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 int kind_self;
8970 int kind_sub;
8971 void *data_self;
8972 void *data_sub;
8973 Py_ssize_t offset;
8974 Py_ssize_t i;
8975 Py_ssize_t end_sub;
8976
8977 if (PyUnicode_READY(self) == -1 ||
8978 PyUnicode_READY(substring) == -1)
8979 return 0;
8980
8981 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008982 return 1;
8983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8985 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008986 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008987 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 kind_self = PyUnicode_KIND(self);
8990 data_self = PyUnicode_DATA(self);
8991 kind_sub = PyUnicode_KIND(substring);
8992 data_sub = PyUnicode_DATA(substring);
8993 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8994
8995 if (direction > 0)
8996 offset = end;
8997 else
8998 offset = start;
8999
9000 if (PyUnicode_READ(kind_self, data_self, offset) ==
9001 PyUnicode_READ(kind_sub, data_sub, 0) &&
9002 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9003 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9004 /* If both are of the same kind, memcmp is sufficient */
9005 if (kind_self == kind_sub) {
9006 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009007 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009008 data_sub,
9009 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009010 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 }
9012 /* otherwise we have to compare each character by first accesing it */
9013 else {
9014 /* We do not need to compare 0 and len(substring)-1 because
9015 the if statement above ensured already that they are equal
9016 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009017 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 for (i = 1; i < end_sub; ++i) {
9019 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9020 PyUnicode_READ(kind_sub, data_sub, i))
9021 return 0;
9022 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 }
9026
9027 return 0;
9028}
9029
Alexander Belopolsky40018472011-02-26 01:02:56 +00009030Py_ssize_t
9031PyUnicode_Tailmatch(PyObject *str,
9032 PyObject *substr,
9033 Py_ssize_t start,
9034 Py_ssize_t end,
9035 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009036{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009037 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009038
Guido van Rossumd57fd912000-03-10 22:53:23 +00009039 str = PyUnicode_FromObject(str);
9040 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 substr = PyUnicode_FromObject(substr);
9043 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 Py_DECREF(str);
9045 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009046 }
Tim Petersced69f82003-09-16 20:30:58 +00009047
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009048 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 Py_DECREF(str);
9051 Py_DECREF(substr);
9052 return result;
9053}
9054
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055/* Apply fixfct filter to the Unicode object self and return a
9056 reference to the modified object */
9057
Alexander Belopolsky40018472011-02-26 01:02:56 +00009058static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009059fixup(PyObject *self,
9060 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062 PyObject *u;
9063 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009064 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009066 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009068 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009069 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 /* fix functions return the new maximum character in a string,
9072 if the kind of the resulting unicode object does not change,
9073 everything is fine. Otherwise we need to change the string kind
9074 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009075 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009076
9077 if (maxchar_new == 0) {
9078 /* no changes */;
9079 if (PyUnicode_CheckExact(self)) {
9080 Py_DECREF(u);
9081 Py_INCREF(self);
9082 return self;
9083 }
9084 else
9085 return u;
9086 }
9087
Victor Stinnere6abb482012-05-02 01:15:40 +02009088 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089
Victor Stinnereaab6042011-12-11 22:22:39 +01009090 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009092
9093 /* In case the maximum character changed, we need to
9094 convert the string to the new category. */
9095 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9096 if (v == NULL) {
9097 Py_DECREF(u);
9098 return NULL;
9099 }
9100 if (maxchar_new > maxchar_old) {
9101 /* If the maxchar increased so that the kind changed, not all
9102 characters are representable anymore and we need to fix the
9103 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009104 _PyUnicode_FastCopyCharacters(v, 0,
9105 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009106 maxchar_old = fixfct(v);
9107 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 }
9109 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009110 _PyUnicode_FastCopyCharacters(v, 0,
9111 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009113 Py_DECREF(u);
9114 assert(_PyUnicode_CheckConsistency(v, 1));
9115 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116}
9117
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009118static PyObject *
9119ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009121 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9122 char *resdata, *data = PyUnicode_DATA(self);
9123 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009124
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009125 res = PyUnicode_New(len, 127);
9126 if (res == NULL)
9127 return NULL;
9128 resdata = PyUnicode_DATA(res);
9129 if (lower)
9130 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009132 _Py_bytes_upper(resdata, data, len);
9133 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009134}
9135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009137handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009138{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009139 Py_ssize_t j;
9140 int final_sigma;
9141 Py_UCS4 c;
9142 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009143
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009144 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9145
9146 where ! is a negation and \p{xxx} is a character with property xxx.
9147 */
9148 for (j = i - 1; j >= 0; j--) {
9149 c = PyUnicode_READ(kind, data, j);
9150 if (!_PyUnicode_IsCaseIgnorable(c))
9151 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009153 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9154 if (final_sigma) {
9155 for (j = i + 1; j < length; j++) {
9156 c = PyUnicode_READ(kind, data, j);
9157 if (!_PyUnicode_IsCaseIgnorable(c))
9158 break;
9159 }
9160 final_sigma = j == length || !_PyUnicode_IsCased(c);
9161 }
9162 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009163}
9164
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009165static int
9166lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9167 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009169 /* Obscure special case. */
9170 if (c == 0x3A3) {
9171 mapped[0] = handle_capital_sigma(kind, data, length, i);
9172 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009174 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009175}
9176
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009177static Py_ssize_t
9178do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009180 Py_ssize_t i, k = 0;
9181 int n_res, j;
9182 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009183
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009184 c = PyUnicode_READ(kind, data, 0);
9185 n_res = _PyUnicode_ToUpperFull(c, mapped);
9186 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009187 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009188 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009189 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009190 for (i = 1; i < length; i++) {
9191 c = PyUnicode_READ(kind, data, i);
9192 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9193 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009194 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009195 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009196 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009197 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009198 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009199}
9200
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009201static Py_ssize_t
9202do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9203 Py_ssize_t i, k = 0;
9204
9205 for (i = 0; i < length; i++) {
9206 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9207 int n_res, j;
9208 if (Py_UNICODE_ISUPPER(c)) {
9209 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9210 }
9211 else if (Py_UNICODE_ISLOWER(c)) {
9212 n_res = _PyUnicode_ToUpperFull(c, mapped);
9213 }
9214 else {
9215 n_res = 1;
9216 mapped[0] = c;
9217 }
9218 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009219 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009220 res[k++] = mapped[j];
9221 }
9222 }
9223 return k;
9224}
9225
9226static Py_ssize_t
9227do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9228 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009230 Py_ssize_t i, k = 0;
9231
9232 for (i = 0; i < length; i++) {
9233 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9234 int n_res, j;
9235 if (lower)
9236 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9237 else
9238 n_res = _PyUnicode_ToUpperFull(c, mapped);
9239 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009240 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009241 res[k++] = mapped[j];
9242 }
9243 }
9244 return k;
9245}
9246
9247static Py_ssize_t
9248do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9249{
9250 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9251}
9252
9253static Py_ssize_t
9254do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9255{
9256 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9257}
9258
Benjamin Petersone51757f2012-01-12 21:10:29 -05009259static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009260do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9261{
9262 Py_ssize_t i, k = 0;
9263
9264 for (i = 0; i < length; i++) {
9265 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9266 Py_UCS4 mapped[3];
9267 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9268 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009269 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009270 res[k++] = mapped[j];
9271 }
9272 }
9273 return k;
9274}
9275
9276static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009277do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9278{
9279 Py_ssize_t i, k = 0;
9280 int previous_is_cased;
9281
9282 previous_is_cased = 0;
9283 for (i = 0; i < length; i++) {
9284 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9285 Py_UCS4 mapped[3];
9286 int n_res, j;
9287
9288 if (previous_is_cased)
9289 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9290 else
9291 n_res = _PyUnicode_ToTitleFull(c, mapped);
9292
9293 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009294 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009295 res[k++] = mapped[j];
9296 }
9297
9298 previous_is_cased = _PyUnicode_IsCased(c);
9299 }
9300 return k;
9301}
9302
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009303static PyObject *
9304case_operation(PyObject *self,
9305 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9306{
9307 PyObject *res = NULL;
9308 Py_ssize_t length, newlength = 0;
9309 int kind, outkind;
9310 void *data, *outdata;
9311 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9312
Benjamin Petersoneea48462012-01-16 14:28:50 -05009313 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009314
9315 kind = PyUnicode_KIND(self);
9316 data = PyUnicode_DATA(self);
9317 length = PyUnicode_GET_LENGTH(self);
9318 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9319 if (tmp == NULL)
9320 return PyErr_NoMemory();
9321 newlength = perform(kind, data, length, tmp, &maxchar);
9322 res = PyUnicode_New(newlength, maxchar);
9323 if (res == NULL)
9324 goto leave;
9325 tmpend = tmp + newlength;
9326 outdata = PyUnicode_DATA(res);
9327 outkind = PyUnicode_KIND(res);
9328 switch (outkind) {
9329 case PyUnicode_1BYTE_KIND:
9330 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9331 break;
9332 case PyUnicode_2BYTE_KIND:
9333 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9334 break;
9335 case PyUnicode_4BYTE_KIND:
9336 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9337 break;
9338 default:
9339 assert(0);
9340 break;
9341 }
9342 leave:
9343 PyMem_FREE(tmp);
9344 return res;
9345}
9346
Tim Peters8ce9f162004-08-27 01:49:32 +00009347PyObject *
9348PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009351 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009353 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009354 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9355 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009356 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009358 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009360 int use_memcpy;
9361 unsigned char *res_data = NULL, *sep_data = NULL;
9362 PyObject *last_obj;
9363 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364
Tim Peters05eba1f2004-08-27 21:32:02 +00009365 fseq = PySequence_Fast(seq, "");
9366 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009367 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009368 }
9369
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009370 /* NOTE: the following code can't call back into Python code,
9371 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009372 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009373
Tim Peters05eba1f2004-08-27 21:32:02 +00009374 seqlen = PySequence_Fast_GET_SIZE(fseq);
9375 /* If empty sequence, return u"". */
9376 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009377 Py_DECREF(fseq);
9378 Py_INCREF(unicode_empty);
9379 res = unicode_empty;
9380 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009381 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009382
Tim Peters05eba1f2004-08-27 21:32:02 +00009383 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009384 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009385 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009386 if (seqlen == 1) {
9387 if (PyUnicode_CheckExact(items[0])) {
9388 res = items[0];
9389 Py_INCREF(res);
9390 Py_DECREF(fseq);
9391 return res;
9392 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009393 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009394 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009395 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009396 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009397 /* Set up sep and seplen */
9398 if (separator == NULL) {
9399 /* fall back to a blank space separator */
9400 sep = PyUnicode_FromOrdinal(' ');
9401 if (!sep)
9402 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009403 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009404 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009405 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009406 else {
9407 if (!PyUnicode_Check(separator)) {
9408 PyErr_Format(PyExc_TypeError,
9409 "separator: expected str instance,"
9410 " %.80s found",
9411 Py_TYPE(separator)->tp_name);
9412 goto onError;
9413 }
9414 if (PyUnicode_READY(separator))
9415 goto onError;
9416 sep = separator;
9417 seplen = PyUnicode_GET_LENGTH(separator);
9418 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9419 /* inc refcount to keep this code path symmetric with the
9420 above case of a blank separator */
9421 Py_INCREF(sep);
9422 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009423 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009424 }
9425
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009426 /* There are at least two things to join, or else we have a subclass
9427 * of str in the sequence.
9428 * Do a pre-pass to figure out the total amount of space we'll
9429 * need (sz), and see whether all argument are strings.
9430 */
9431 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009432#ifdef Py_DEBUG
9433 use_memcpy = 0;
9434#else
9435 use_memcpy = 1;
9436#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009437 for (i = 0; i < seqlen; i++) {
9438 const Py_ssize_t old_sz = sz;
9439 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009440 if (!PyUnicode_Check(item)) {
9441 PyErr_Format(PyExc_TypeError,
9442 "sequence item %zd: expected str instance,"
9443 " %.80s found",
9444 i, Py_TYPE(item)->tp_name);
9445 goto onError;
9446 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 if (PyUnicode_READY(item) == -1)
9448 goto onError;
9449 sz += PyUnicode_GET_LENGTH(item);
9450 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009451 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009452 if (i != 0)
9453 sz += seplen;
9454 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9455 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009456 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009457 goto onError;
9458 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009459 if (use_memcpy && last_obj != NULL) {
9460 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9461 use_memcpy = 0;
9462 }
9463 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009464 }
Tim Petersced69f82003-09-16 20:30:58 +00009465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009467 if (res == NULL)
9468 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009469
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009470 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009471#ifdef Py_DEBUG
9472 use_memcpy = 0;
9473#else
9474 if (use_memcpy) {
9475 res_data = PyUnicode_1BYTE_DATA(res);
9476 kind = PyUnicode_KIND(res);
9477 if (seplen != 0)
9478 sep_data = PyUnicode_1BYTE_DATA(sep);
9479 }
9480#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009481 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009482 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009483 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009484 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009485 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009486 if (use_memcpy) {
9487 Py_MEMCPY(res_data,
9488 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009489 kind * seplen);
9490 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009491 }
9492 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009493 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009494 res_offset += seplen;
9495 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009496 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009497 itemlen = PyUnicode_GET_LENGTH(item);
9498 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009499 if (use_memcpy) {
9500 Py_MEMCPY(res_data,
9501 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009502 kind * itemlen);
9503 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009504 }
9505 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009506 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 res_offset += itemlen;
9508 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009509 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009510 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009511 if (use_memcpy)
9512 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009513 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009514 else
9515 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009516
Tim Peters05eba1f2004-08-27 21:32:02 +00009517 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009519 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009521
Benjamin Peterson29060642009-01-31 22:14:21 +00009522 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009523 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009525 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526 return NULL;
9527}
9528
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width: PyUnicode_1BYTE_KIND (unsigned char, via memset),
   PyUnicode_2BYTE_KIND (Py_UCS2) or PyUnicode_4BYTE_KIND (Py_UCS4).
   Any other kind trips assert(0).

   `value` and `length` are evaluated exactly once, before the switch, so
   expression arguments are safe; every argument is parenthesized where it
   is used.  `kind` is evaluated by both the assert and the switch.

   The macro expands to a single statement and declares the block-local
   names fill_value_, fill_length_, i_ and to_; arguments must not refer to
   variables with those names. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_value_ = (value); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)(data) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)(data) + (start); \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9552
Victor Stinnerd3f08822012-05-29 12:57:52 +02009553void
9554_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9555 Py_UCS4 fill_char)
9556{
9557 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9558 const void *data = PyUnicode_DATA(unicode);
9559 assert(PyUnicode_IS_READY(unicode));
9560 assert(unicode_modifiable(unicode));
9561 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9562 assert(start >= 0);
9563 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9564 FILL(kind, data, fill_char, start, length);
9565}
9566
Victor Stinner3fe55312012-01-04 00:33:50 +01009567Py_ssize_t
9568PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9569 Py_UCS4 fill_char)
9570{
9571 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009572
9573 if (!PyUnicode_Check(unicode)) {
9574 PyErr_BadInternalCall();
9575 return -1;
9576 }
9577 if (PyUnicode_READY(unicode) == -1)
9578 return -1;
9579 if (unicode_check_modifiable(unicode))
9580 return -1;
9581
Victor Stinnerd3f08822012-05-29 12:57:52 +02009582 if (start < 0) {
9583 PyErr_SetString(PyExc_IndexError, "string index out of range");
9584 return -1;
9585 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009586 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9587 PyErr_SetString(PyExc_ValueError,
9588 "fill character is bigger than "
9589 "the string maximum character");
9590 return -1;
9591 }
9592
9593 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9594 length = Py_MIN(maxlen, length);
9595 if (length <= 0)
9596 return 0;
9597
Victor Stinnerd3f08822012-05-29 12:57:52 +02009598 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009599 return length;
9600}
9601
Victor Stinner9310abb2011-10-05 00:59:23 +02009602static PyObject *
9603pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009604 Py_ssize_t left,
9605 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 PyObject *u;
9609 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009610 int kind;
9611 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009612
9613 if (left < 0)
9614 left = 0;
9615 if (right < 0)
9616 right = 0;
9617
Victor Stinnerc4b49542011-12-11 22:44:26 +01009618 if (left == 0 && right == 0)
9619 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9622 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009623 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9624 return NULL;
9625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009627 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009629 if (!u)
9630 return NULL;
9631
9632 kind = PyUnicode_KIND(u);
9633 data = PyUnicode_DATA(u);
9634 if (left)
9635 FILL(kind, data, fill, 0, left);
9636 if (right)
9637 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009638 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009639 assert(_PyUnicode_CheckConsistency(u, 1));
9640 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009641}
9642
Alexander Belopolsky40018472011-02-26 01:02:56 +00009643PyObject *
9644PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009645{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009646 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647
9648 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009649 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009651 if (PyUnicode_READY(string) == -1) {
9652 Py_DECREF(string);
9653 return NULL;
9654 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009655
Benjamin Petersonead6b532011-12-20 17:23:42 -06009656 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009658 if (PyUnicode_IS_ASCII(string))
9659 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009660 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009661 PyUnicode_GET_LENGTH(string), keepends);
9662 else
9663 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009664 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009665 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 break;
9667 case PyUnicode_2BYTE_KIND:
9668 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009669 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 PyUnicode_GET_LENGTH(string), keepends);
9671 break;
9672 case PyUnicode_4BYTE_KIND:
9673 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009674 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 PyUnicode_GET_LENGTH(string), keepends);
9676 break;
9677 default:
9678 assert(0);
9679 list = 0;
9680 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681 Py_DECREF(string);
9682 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683}
9684
Alexander Belopolsky40018472011-02-26 01:02:56 +00009685static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009686split(PyObject *self,
9687 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009688 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 int kind1, kind2, kind;
9691 void *buf1, *buf2;
9692 Py_ssize_t len1, len2;
9693 PyObject* out;
9694
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009696 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 if (PyUnicode_READY(self) == -1)
9699 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009702 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009704 if (PyUnicode_IS_ASCII(self))
9705 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009706 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009707 PyUnicode_GET_LENGTH(self), maxcount
9708 );
9709 else
9710 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009711 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009712 PyUnicode_GET_LENGTH(self), maxcount
9713 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 case PyUnicode_2BYTE_KIND:
9715 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009716 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 PyUnicode_GET_LENGTH(self), maxcount
9718 );
9719 case PyUnicode_4BYTE_KIND:
9720 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009721 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 PyUnicode_GET_LENGTH(self), maxcount
9723 );
9724 default:
9725 assert(0);
9726 return NULL;
9727 }
9728
9729 if (PyUnicode_READY(substring) == -1)
9730 return NULL;
9731
9732 kind1 = PyUnicode_KIND(self);
9733 kind2 = PyUnicode_KIND(substring);
9734 kind = kind1 > kind2 ? kind1 : kind2;
9735 buf1 = PyUnicode_DATA(self);
9736 buf2 = PyUnicode_DATA(substring);
9737 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009738 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (!buf1)
9740 return NULL;
9741 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009742 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009743 if (!buf2) {
9744 if (kind1 != kind) PyMem_Free(buf1);
9745 return NULL;
9746 }
9747 len1 = PyUnicode_GET_LENGTH(self);
9748 len2 = PyUnicode_GET_LENGTH(substring);
9749
Benjamin Petersonead6b532011-12-20 17:23:42 -06009750 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009751 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009752 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9753 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009754 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009755 else
9756 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009757 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 break;
9759 case PyUnicode_2BYTE_KIND:
9760 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009761 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009762 break;
9763 case PyUnicode_4BYTE_KIND:
9764 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009765 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009766 break;
9767 default:
9768 out = NULL;
9769 }
9770 if (kind1 != kind)
9771 PyMem_Free(buf1);
9772 if (kind2 != kind)
9773 PyMem_Free(buf2);
9774 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009775}
9776
Alexander Belopolsky40018472011-02-26 01:02:56 +00009777static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009778rsplit(PyObject *self,
9779 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009780 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009781{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 int kind1, kind2, kind;
9783 void *buf1, *buf2;
9784 Py_ssize_t len1, len2;
9785 PyObject* out;
9786
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009787 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009788 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 if (PyUnicode_READY(self) == -1)
9791 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009794 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009795 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009796 if (PyUnicode_IS_ASCII(self))
9797 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009798 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009799 PyUnicode_GET_LENGTH(self), maxcount
9800 );
9801 else
9802 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009804 PyUnicode_GET_LENGTH(self), maxcount
9805 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009806 case PyUnicode_2BYTE_KIND:
9807 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009808 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009809 PyUnicode_GET_LENGTH(self), maxcount
9810 );
9811 case PyUnicode_4BYTE_KIND:
9812 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009813 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009814 PyUnicode_GET_LENGTH(self), maxcount
9815 );
9816 default:
9817 assert(0);
9818 return NULL;
9819 }
9820
9821 if (PyUnicode_READY(substring) == -1)
9822 return NULL;
9823
9824 kind1 = PyUnicode_KIND(self);
9825 kind2 = PyUnicode_KIND(substring);
9826 kind = kind1 > kind2 ? kind1 : kind2;
9827 buf1 = PyUnicode_DATA(self);
9828 buf2 = PyUnicode_DATA(substring);
9829 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 if (!buf1)
9832 return NULL;
9833 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009834 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 if (!buf2) {
9836 if (kind1 != kind) PyMem_Free(buf1);
9837 return NULL;
9838 }
9839 len1 = PyUnicode_GET_LENGTH(self);
9840 len2 = PyUnicode_GET_LENGTH(substring);
9841
Benjamin Petersonead6b532011-12-20 17:23:42 -06009842 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009844 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9845 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009846 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009847 else
9848 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009849 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009850 break;
9851 case PyUnicode_2BYTE_KIND:
9852 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009853 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 break;
9855 case PyUnicode_4BYTE_KIND:
9856 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009857 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 break;
9859 default:
9860 out = NULL;
9861 }
9862 if (kind1 != kind)
9863 PyMem_Free(buf1);
9864 if (kind2 != kind)
9865 PyMem_Free(buf2);
9866 return out;
9867}
9868
9869static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9871 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009873 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009874 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009875 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9876 return asciilib_find(buf1, len1, buf2, len2, offset);
9877 else
9878 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 case PyUnicode_2BYTE_KIND:
9880 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9881 case PyUnicode_4BYTE_KIND:
9882 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9883 }
9884 assert(0);
9885 return -1;
9886}
9887
9888static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009889anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9890 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -06009892 switch (kind) {
9893 case PyUnicode_1BYTE_KIND:
9894 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9895 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9896 else
9897 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9898 case PyUnicode_2BYTE_KIND:
9899 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9900 case PyUnicode_4BYTE_KIND:
9901 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9902 }
9903 assert(0);
9904 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009905}
9906
Alexander Belopolsky40018472011-02-26 01:02:56 +00009907static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908replace(PyObject *self, PyObject *str1,
9909 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 PyObject *u;
9912 char *sbuf = PyUnicode_DATA(self);
9913 char *buf1 = PyUnicode_DATA(str1);
9914 char *buf2 = PyUnicode_DATA(str2);
9915 int srelease = 0, release1 = 0, release2 = 0;
9916 int skind = PyUnicode_KIND(self);
9917 int kind1 = PyUnicode_KIND(str1);
9918 int kind2 = PyUnicode_KIND(str2);
9919 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9920 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9921 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009922 int mayshrink;
9923 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924
9925 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009926 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009928 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929
Victor Stinner59de0ee2011-10-07 10:01:28 +02009930 if (str1 == str2)
9931 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 if (skind < kind1)
9933 /* substring too wide to be present */
9934 goto nothing;
9935
Victor Stinner49a0a212011-10-12 23:46:10 +02009936 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9937 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9938 /* Replacing str1 with str2 may cause a maxchar reduction in the
9939 result string. */
9940 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +02009941 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009944 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009946 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009948 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009949 Py_UCS4 u1, u2;
9950 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +01009951 Py_ssize_t index, pos;
9952 char *src;
9953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +01009955 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
9956 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009957 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009958 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009960 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +02009962 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +01009964
9965 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
9966 index = 0;
9967 src = sbuf;
9968 while (--maxcount)
9969 {
9970 pos++;
9971 src += pos * PyUnicode_KIND(self);
9972 slen -= pos;
9973 index += pos;
9974 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
9975 if (pos < 0)
9976 break;
9977 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
9978 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009979 }
9980 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 int rkind = skind;
9982 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +01009983 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +02009984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 if (kind1 < rkind) {
9986 /* widen substring */
9987 buf1 = _PyUnicode_AsKind(str1, rkind);
9988 if (!buf1) goto error;
9989 release1 = 1;
9990 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009991 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009992 if (i < 0)
9993 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 if (rkind > kind2) {
9995 /* widen replacement */
9996 buf2 = _PyUnicode_AsKind(str2, rkind);
9997 if (!buf2) goto error;
9998 release2 = 1;
9999 }
10000 else if (rkind < kind2) {
10001 /* widen self and buf1 */
10002 rkind = kind2;
10003 if (release1) PyMem_Free(buf1);
10004 sbuf = _PyUnicode_AsKind(self, rkind);
10005 if (!sbuf) goto error;
10006 srelease = 1;
10007 buf1 = _PyUnicode_AsKind(str1, rkind);
10008 if (!buf1) goto error;
10009 release1 = 1;
10010 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010011 u = PyUnicode_New(slen, maxchar);
10012 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010014 assert(PyUnicode_KIND(u) == rkind);
10015 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010016
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010017 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010018 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010019 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010021 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010023
10024 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010025 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010026 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010027 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010028 if (i == -1)
10029 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010030 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010032 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010036 }
10037 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010039 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 int rkind = skind;
10041 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010044 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 buf1 = _PyUnicode_AsKind(str1, rkind);
10046 if (!buf1) goto error;
10047 release1 = 1;
10048 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010049 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010050 if (n == 0)
10051 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010053 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054 buf2 = _PyUnicode_AsKind(str2, rkind);
10055 if (!buf2) goto error;
10056 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010059 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 rkind = kind2;
10061 sbuf = _PyUnicode_AsKind(self, rkind);
10062 if (!sbuf) goto error;
10063 srelease = 1;
10064 if (release1) PyMem_Free(buf1);
10065 buf1 = _PyUnicode_AsKind(str1, rkind);
10066 if (!buf1) goto error;
10067 release1 = 1;
10068 }
10069 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10070 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010071 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 PyErr_SetString(PyExc_OverflowError,
10073 "replace string is too long");
10074 goto error;
10075 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010076 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010077 if (new_size == 0) {
10078 Py_INCREF(unicode_empty);
10079 u = unicode_empty;
10080 goto done;
10081 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010082 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 PyErr_SetString(PyExc_OverflowError,
10084 "replace string is too long");
10085 goto error;
10086 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010087 u = PyUnicode_New(new_size, maxchar);
10088 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010090 assert(PyUnicode_KIND(u) == rkind);
10091 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 ires = i = 0;
10093 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010094 while (n-- > 0) {
10095 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010096 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010097 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010098 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010099 if (j == -1)
10100 break;
10101 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010102 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010103 memcpy(res + rkind * ires,
10104 sbuf + rkind * i,
10105 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010107 }
10108 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010110 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010112 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010116 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010118 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010119 memcpy(res + rkind * ires,
10120 sbuf + rkind * i,
10121 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010122 }
10123 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010124 /* interleave */
10125 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010126 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010128 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010130 if (--n <= 0)
10131 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 memcpy(res + rkind * ires,
10133 sbuf + rkind * i,
10134 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 ires++;
10136 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010137 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010138 memcpy(res + rkind * ires,
10139 sbuf + rkind * i,
10140 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010141 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010142 }
10143
10144 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010145 unicode_adjust_maxchar(&u);
10146 if (u == NULL)
10147 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010148 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010149
10150 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 if (srelease)
10152 PyMem_FREE(sbuf);
10153 if (release1)
10154 PyMem_FREE(buf1);
10155 if (release2)
10156 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010157 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159
Benjamin Peterson29060642009-01-31 22:14:21 +000010160 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 if (srelease)
10163 PyMem_FREE(sbuf);
10164 if (release1)
10165 PyMem_FREE(buf1);
10166 if (release2)
10167 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010168 return unicode_result_unchanged(self);
10169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 error:
10171 if (srelease && sbuf)
10172 PyMem_FREE(sbuf);
10173 if (release1 && buf1)
10174 PyMem_FREE(buf1);
10175 if (release2 && buf2)
10176 PyMem_FREE(buf2);
10177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178}
10179
10180/* --- Unicode Object Methods --------------------------------------------- */
10181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010182PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010183 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010184\n\
10185Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010186characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010187
10188static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010189unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010191 if (PyUnicode_READY(self) == -1)
10192 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010193 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194}
10195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010196PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010197 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010198\n\
10199Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010200have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
10202static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010203unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010205 if (PyUnicode_READY(self) == -1)
10206 return NULL;
10207 if (PyUnicode_GET_LENGTH(self) == 0)
10208 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010209 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210}
10211
Benjamin Petersond5890c82012-01-14 13:23:30 -050010212PyDoc_STRVAR(casefold__doc__,
10213 "S.casefold() -> str\n\
10214\n\
10215Return a version of S suitable for caseless comparisons.");
10216
10217static PyObject *
10218unicode_casefold(PyObject *self)
10219{
10220 if (PyUnicode_READY(self) == -1)
10221 return NULL;
10222 if (PyUnicode_IS_ASCII(self))
10223 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010224 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010225}
10226
10227
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010228/* Argument converter. Coerces to a single unicode character */
10229
10230static int
10231convert_uc(PyObject *obj, void *addr)
10232{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010234 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010235
Benjamin Peterson14339b62009-01-31 16:36:08 +000010236 uniobj = PyUnicode_FromObject(obj);
10237 if (uniobj == NULL) {
10238 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010239 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010240 return 0;
10241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010243 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010244 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010245 Py_DECREF(uniobj);
10246 return 0;
10247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010249 Py_DECREF(uniobj);
10250 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010251}
10252
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010253PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010254 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010256Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010257done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
10259static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010260unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010262 Py_ssize_t marg, left;
10263 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 Py_UCS4 fillchar = ' ';
10265
Victor Stinnere9a29352011-10-01 02:14:59 +020010266 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268
Benjamin Petersonbac79492012-01-14 13:34:47 -050010269 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270 return NULL;
10271
Victor Stinnerc4b49542011-12-11 22:44:26 +010010272 if (PyUnicode_GET_LENGTH(self) >= width)
10273 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274
Victor Stinnerc4b49542011-12-11 22:44:26 +010010275 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276 left = marg / 2 + (marg & width & 1);
10277
Victor Stinner9310abb2011-10-05 00:59:23 +020010278 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010279}
10280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281/* This function assumes that str1 and str2 are readied by the caller. */
10282
Marc-André Lemburge5034372000-08-08 08:04:29 +000010283static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010284unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010285{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 int kind1, kind2;
10287 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010288 Py_ssize_t len1, len2;
10289 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010290
Victor Stinner90db9c42012-10-04 21:53:50 +020010291 /* a string is equal to itself */
10292 if (str1 == str2)
10293 return 0;
10294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 kind1 = PyUnicode_KIND(str1);
10296 kind2 = PyUnicode_KIND(str2);
10297 data1 = PyUnicode_DATA(str1);
10298 data2 = PyUnicode_DATA(str2);
10299 len1 = PyUnicode_GET_LENGTH(str1);
10300 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010301 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010302
Victor Stinner770e19e2012-10-04 22:59:45 +020010303 if (kind1 == 1 && kind2 == 1) {
10304 int cmp = memcmp(data1, data2, len);
10305 /* normalize result of memcmp() into the range [-1; 1] */
10306 if (cmp < 0)
10307 return -1;
10308 if (cmp > 0)
10309 return 1;
10310 }
10311 else {
10312 for (i = 0; i < len; ++i) {
10313 Py_UCS4 c1, c2;
10314 c1 = PyUnicode_READ(kind1, data1, i);
10315 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010316
Victor Stinner770e19e2012-10-04 22:59:45 +020010317 if (c1 != c2)
10318 return (c1 < c2) ? -1 : 1;
10319 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010320 }
10321
Victor Stinner770e19e2012-10-04 22:59:45 +020010322 if (len1 == len2)
10323 return 0;
10324 if (len1 < len2)
10325 return -1;
10326 else
10327 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010328}
10329
Alexander Belopolsky40018472011-02-26 01:02:56 +000010330int
10331PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10334 if (PyUnicode_READY(left) == -1 ||
10335 PyUnicode_READY(right) == -1)
10336 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010337 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010339 PyErr_Format(PyExc_TypeError,
10340 "Can't compare %.100s and %.100s",
10341 left->ob_type->tp_name,
10342 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010343 return -1;
10344}
10345
Martin v. Löwis5b222132007-06-10 09:51:05 +000010346int
10347PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 Py_ssize_t i;
10350 int kind;
10351 void *data;
10352 Py_UCS4 chr;
10353
Victor Stinner910337b2011-10-03 03:20:16 +020010354 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 if (PyUnicode_READY(uni) == -1)
10356 return -1;
10357 kind = PyUnicode_KIND(uni);
10358 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010359 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10361 if (chr != str[i])
10362 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010363 /* This check keeps Python strings that end in '\0' from comparing equal
10364 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010366 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010367 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010368 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010369 return 0;
10370}
10371
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010372
/* Map a C truth value to Py_True or Py_False.  The macro does not touch the
   reference count of the object it yields. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010375
Alexander Belopolsky40018472011-02-26 01:02:56 +000010376PyObject *
10377PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010378{
10379 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010380
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010381 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10382 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 if (PyUnicode_READY(left) == -1 ||
10384 PyUnicode_READY(right) == -1)
10385 return NULL;
10386 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10387 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010388 if (op == Py_EQ) {
10389 Py_INCREF(Py_False);
10390 return Py_False;
10391 }
10392 if (op == Py_NE) {
10393 Py_INCREF(Py_True);
10394 return Py_True;
10395 }
10396 }
Victor Stinner90db9c42012-10-04 21:53:50 +020010397 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010398
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010399 /* Convert the return value to a Boolean */
10400 switch (op) {
10401 case Py_EQ:
10402 v = TEST_COND(result == 0);
10403 break;
10404 case Py_NE:
10405 v = TEST_COND(result != 0);
10406 break;
10407 case Py_LE:
10408 v = TEST_COND(result <= 0);
10409 break;
10410 case Py_GE:
10411 v = TEST_COND(result >= 0);
10412 break;
10413 case Py_LT:
10414 v = TEST_COND(result == -1);
10415 break;
10416 case Py_GT:
10417 v = TEST_COND(result == 1);
10418 break;
10419 default:
10420 PyErr_BadArgument();
10421 return NULL;
10422 }
10423 Py_INCREF(v);
10424 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010426
Brian Curtindfc80e32011-08-10 20:28:54 -050010427 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010428}
10429
Alexander Belopolsky40018472011-02-26 01:02:56 +000010430int
10431PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010432{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434 int kind1, kind2, kind;
10435 void *buf1, *buf2;
10436 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010437 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010438
10439 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010440 sub = PyUnicode_FromObject(element);
10441 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010442 PyErr_Format(PyExc_TypeError,
10443 "'in <string>' requires string as left operand, not %s",
10444 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010446 }
10447
Thomas Wouters477c8d52006-05-27 19:21:47 +000010448 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010449 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010450 Py_DECREF(sub);
10451 return -1;
10452 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010453 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10454 Py_DECREF(sub);
10455 Py_DECREF(str);
10456 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 kind1 = PyUnicode_KIND(str);
10459 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010460 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 buf1 = PyUnicode_DATA(str);
10462 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010463 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010464 if (kind2 > kind) {
10465 Py_DECREF(sub);
10466 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010467 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010468 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010469 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010471 if (!buf2) {
10472 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010473 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 return -1;
10475 }
10476 len1 = PyUnicode_GET_LENGTH(str);
10477 len2 = PyUnicode_GET_LENGTH(sub);
10478
Benjamin Petersonead6b532011-12-20 17:23:42 -060010479 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 case PyUnicode_1BYTE_KIND:
10481 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10482 break;
10483 case PyUnicode_2BYTE_KIND:
10484 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10485 break;
10486 case PyUnicode_4BYTE_KIND:
10487 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10488 break;
10489 default:
10490 result = -1;
10491 assert(0);
10492 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493
10494 Py_DECREF(str);
10495 Py_DECREF(sub);
10496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 if (kind2 != kind)
10498 PyMem_Free(buf2);
10499
Guido van Rossum403d68b2000-03-13 15:55:09 +000010500 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010501}
10502
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503/* Concat to string or Unicode object giving a new Unicode object. */
10504
Alexander Belopolsky40018472011-02-26 01:02:56 +000010505PyObject *
10506PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010509 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010510 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
10512 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010515 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519
10520 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010521 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010522 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010525 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528 }
10529
Victor Stinner488fa492011-12-12 00:01:39 +010010530 u_len = PyUnicode_GET_LENGTH(u);
10531 v_len = PyUnicode_GET_LENGTH(v);
10532 if (u_len > PY_SSIZE_T_MAX - v_len) {
10533 PyErr_SetString(PyExc_OverflowError,
10534 "strings are too large to concat");
10535 goto onError;
10536 }
10537 new_len = u_len + v_len;
10538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010540 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010541 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010544 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010545 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010547 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10548 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549 Py_DECREF(u);
10550 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010551 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553
Benjamin Peterson29060642009-01-31 22:14:21 +000010554 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555 Py_XDECREF(u);
10556 Py_XDECREF(v);
10557 return NULL;
10558}
10559
Walter Dörwald1ab83302007-05-18 17:15:44 +000010560void
Victor Stinner23e56682011-10-03 03:54:37 +020010561PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010562{
Victor Stinner23e56682011-10-03 03:54:37 +020010563 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010564 Py_UCS4 maxchar, maxchar2;
10565 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010566
10567 if (p_left == NULL) {
10568 if (!PyErr_Occurred())
10569 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010570 return;
10571 }
Victor Stinner23e56682011-10-03 03:54:37 +020010572 left = *p_left;
10573 if (right == NULL || !PyUnicode_Check(left)) {
10574 if (!PyErr_Occurred())
10575 PyErr_BadInternalCall();
10576 goto error;
10577 }
10578
Benjamin Petersonbac79492012-01-14 13:34:47 -050010579 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010580 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010581 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010582 goto error;
10583
Victor Stinner488fa492011-12-12 00:01:39 +010010584 /* Shortcuts */
10585 if (left == unicode_empty) {
10586 Py_DECREF(left);
10587 Py_INCREF(right);
10588 *p_left = right;
10589 return;
10590 }
10591 if (right == unicode_empty)
10592 return;
10593
10594 left_len = PyUnicode_GET_LENGTH(left);
10595 right_len = PyUnicode_GET_LENGTH(right);
10596 if (left_len > PY_SSIZE_T_MAX - right_len) {
10597 PyErr_SetString(PyExc_OverflowError,
10598 "strings are too large to concat");
10599 goto error;
10600 }
10601 new_len = left_len + right_len;
10602
10603 if (unicode_modifiable(left)
10604 && PyUnicode_CheckExact(right)
10605 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010606 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10607 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010608 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010609 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010610 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10611 {
10612 /* append inplace */
10613 if (unicode_resize(p_left, new_len) != 0) {
10614 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10615 * deallocated so it cannot be put back into
10616 * 'variable'. The MemoryError is raised when there
10617 * is no value in 'variable', which might (very
10618 * remotely) be a cause of incompatibilities.
10619 */
10620 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010621 }
Victor Stinner488fa492011-12-12 00:01:39 +010010622 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010623 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010624 }
Victor Stinner488fa492011-12-12 00:01:39 +010010625 else {
10626 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10627 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010628 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010629
Victor Stinner488fa492011-12-12 00:01:39 +010010630 /* Concat the two Unicode strings */
10631 res = PyUnicode_New(new_len, maxchar);
10632 if (res == NULL)
10633 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010634 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10635 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010636 Py_DECREF(left);
10637 *p_left = res;
10638 }
10639 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010640 return;
10641
10642error:
Victor Stinner488fa492011-12-12 00:01:39 +010010643 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010644}
10645
10646void
10647PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10648{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010649 PyUnicode_Append(pleft, right);
10650 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010651}
10652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010653PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010657string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010658interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010659
10660static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010661unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010663 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010664 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010665 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 int kind1, kind2, kind;
10668 void *buf1, *buf2;
10669 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670
Jesus Ceaac451502011-04-20 17:09:23 +020010671 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10672 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010673 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010674
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 kind1 = PyUnicode_KIND(self);
10676 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010677 if (kind2 > kind1)
10678 return PyLong_FromLong(0);
10679 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 buf1 = PyUnicode_DATA(self);
10681 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010683 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 if (!buf2) {
10685 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 return NULL;
10687 }
10688 len1 = PyUnicode_GET_LENGTH(self);
10689 len2 = PyUnicode_GET_LENGTH(substring);
10690
10691 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010692 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010693 case PyUnicode_1BYTE_KIND:
10694 iresult = ucs1lib_count(
10695 ((Py_UCS1*)buf1) + start, end - start,
10696 buf2, len2, PY_SSIZE_T_MAX
10697 );
10698 break;
10699 case PyUnicode_2BYTE_KIND:
10700 iresult = ucs2lib_count(
10701 ((Py_UCS2*)buf1) + start, end - start,
10702 buf2, len2, PY_SSIZE_T_MAX
10703 );
10704 break;
10705 case PyUnicode_4BYTE_KIND:
10706 iresult = ucs4lib_count(
10707 ((Py_UCS4*)buf1) + start, end - start,
10708 buf2, len2, PY_SSIZE_T_MAX
10709 );
10710 break;
10711 default:
10712 assert(0); iresult = 0;
10713 }
10714
10715 result = PyLong_FromSsize_t(iresult);
10716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (kind2 != kind)
10718 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719
10720 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010721
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722 return result;
10723}
10724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010725PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010726 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010728Encode S using the codec registered for encoding. Default encoding\n\
10729is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010730handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010731a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10732'xmlcharrefreplace' as well as any other name registered with\n\
10733codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734
10735static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010736unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010738 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739 char *encoding = NULL;
10740 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010741
Benjamin Peterson308d6372009-09-18 21:42:35 +000010742 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10743 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010745 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010746}
10747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010748PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010749 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750\n\
10751Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010752If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
10754static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010755unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010757 Py_ssize_t i, j, line_pos, src_len, incr;
10758 Py_UCS4 ch;
10759 PyObject *u;
10760 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010762 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010763 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764
10765 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
Antoine Pitrou22425222011-10-04 19:10:51 +020010768 if (PyUnicode_READY(self) == -1)
10769 return NULL;
10770
Thomas Wouters7e474022000-07-16 12:04:32 +000010771 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010772 src_len = PyUnicode_GET_LENGTH(self);
10773 i = j = line_pos = 0;
10774 kind = PyUnicode_KIND(self);
10775 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010776 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010777 for (; i < src_len; i++) {
10778 ch = PyUnicode_READ(kind, src_data, i);
10779 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010780 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010781 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010782 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010783 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010784 goto overflow;
10785 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010786 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010787 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010790 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010791 goto overflow;
10792 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010793 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010794 if (ch == '\n' || ch == '\r')
10795 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010797 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010798 if (!found)
10799 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010800
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010802 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010803 if (!u)
10804 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010805 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806
Antoine Pitroue71d5742011-10-04 15:55:09 +020010807 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010808
Antoine Pitroue71d5742011-10-04 15:55:09 +020010809 for (; i < src_len; i++) {
10810 ch = PyUnicode_READ(kind, src_data, i);
10811 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010813 incr = tabsize - (line_pos % tabsize);
10814 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010815 FILL(kind, dest_data, ' ', j, incr);
10816 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010817 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010818 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010820 line_pos++;
10821 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010822 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010823 if (ch == '\n' || ch == '\r')
10824 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010826 }
10827 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010828 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010829
Antoine Pitroue71d5742011-10-04 15:55:09 +020010830 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010831 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10832 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833}
10834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010835PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010836 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837\n\
10838Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010839such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840arguments start and end are interpreted as in slice notation.\n\
10841\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010842Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843
10844static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010845unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010847 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010848 Py_ssize_t start;
10849 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010850 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851
Jesus Ceaac451502011-04-20 17:09:23 +020010852 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10853 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 if (PyUnicode_READY(self) == -1)
10857 return NULL;
10858 if (PyUnicode_READY(substring) == -1)
10859 return NULL;
10860
Victor Stinner7931d9a2011-11-04 00:22:48 +010010861 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862
10863 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 if (result == -2)
10866 return NULL;
10867
Christian Heimes217cfd12007-12-02 14:31:20 +000010868 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869}
10870
10871static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010872unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010874 void *data;
10875 enum PyUnicode_Kind kind;
10876 Py_UCS4 ch;
10877 PyObject *res;
10878
10879 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10880 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010882 }
10883 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10884 PyErr_SetString(PyExc_IndexError, "string index out of range");
10885 return NULL;
10886 }
10887 kind = PyUnicode_KIND(self);
10888 data = PyUnicode_DATA(self);
10889 ch = PyUnicode_READ(kind, data, index);
10890 if (ch < 256)
10891 return get_latin1_char(ch);
10892
10893 res = PyUnicode_New(1, ch);
10894 if (res == NULL)
10895 return NULL;
10896 kind = PyUnicode_KIND(res);
10897 data = PyUnicode_DATA(res);
10898 PyUnicode_WRITE(kind, data, 0, ch);
10899 assert(_PyUnicode_CheckConsistency(res, 1));
10900 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901}
10902
Guido van Rossumc2504932007-09-18 19:42:40 +000010903/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010904 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010905static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010906unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907{
Guido van Rossumc2504932007-09-18 19:42:40 +000010908 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010909 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010910
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010911#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050010912 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040010913#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 if (_PyUnicode_HASH(self) != -1)
10915 return _PyUnicode_HASH(self);
10916 if (PyUnicode_READY(self) == -1)
10917 return -1;
10918 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010010919 /*
10920 We make the hash of the empty string be 0, rather than using
10921 (prefix ^ suffix), since this slightly obfuscates the hash secret
10922 */
10923 if (len == 0) {
10924 _PyUnicode_HASH(self) = 0;
10925 return 0;
10926 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927
10928 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010010929#define HASH(P) \
10930 x ^= (Py_uhash_t) *P << 7; \
10931 while (--len >= 0) \
10932 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933
Georg Brandl2fb477c2012-02-21 00:33:36 +010010934 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 switch (PyUnicode_KIND(self)) {
10936 case PyUnicode_1BYTE_KIND: {
10937 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10938 HASH(c);
10939 break;
10940 }
10941 case PyUnicode_2BYTE_KIND: {
10942 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10943 HASH(s);
10944 break;
10945 }
10946 default: {
10947 Py_UCS4 *l;
10948 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10949 "Impossible switch case in unicode_hash");
10950 l = PyUnicode_4BYTE_DATA(self);
10951 HASH(l);
10952 break;
10953 }
10954 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010010955 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
10956 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957
Guido van Rossumc2504932007-09-18 19:42:40 +000010958 if (x == -1)
10959 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010961 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010966 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010967\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010968Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969
10970static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010972{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010973 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010974 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010975 Py_ssize_t start;
10976 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
Jesus Ceaac451502011-04-20 17:09:23 +020010978 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10979 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (PyUnicode_READY(self) == -1)
10983 return NULL;
10984 if (PyUnicode_READY(substring) == -1)
10985 return NULL;
10986
Victor Stinner7931d9a2011-11-04 00:22:48 +010010987 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 if (result == -2)
10992 return NULL;
10993
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994 if (result < 0) {
10995 PyErr_SetString(PyExc_ValueError, "substring not found");
10996 return NULL;
10997 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010998
Christian Heimes217cfd12007-12-02 14:31:20 +000010999 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000}
11001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011005Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011006at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007
11008static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011009unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 Py_ssize_t i, length;
11012 int kind;
11013 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 int cased;
11015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (PyUnicode_READY(self) == -1)
11017 return NULL;
11018 length = PyUnicode_GET_LENGTH(self);
11019 kind = PyUnicode_KIND(self);
11020 data = PyUnicode_DATA(self);
11021
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 if (length == 1)
11024 return PyBool_FromLong(
11025 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011027 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011029 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011030
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 for (i = 0; i < length; i++) {
11033 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011034
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11036 return PyBool_FromLong(0);
11037 else if (!cased && Py_UNICODE_ISLOWER(ch))
11038 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011040 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011041}
11042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011043PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011044 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011046Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011047at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048
11049static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011050unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 Py_ssize_t i, length;
11053 int kind;
11054 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055 int cased;
11056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 if (PyUnicode_READY(self) == -1)
11058 return NULL;
11059 length = PyUnicode_GET_LENGTH(self);
11060 kind = PyUnicode_KIND(self);
11061 data = PyUnicode_DATA(self);
11062
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011064 if (length == 1)
11065 return PyBool_FromLong(
11066 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011068 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011070 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011071
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 for (i = 0; i < length; i++) {
11074 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011075
Benjamin Peterson29060642009-01-31 22:14:21 +000011076 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11077 return PyBool_FromLong(0);
11078 else if (!cased && Py_UNICODE_ISUPPER(ch))
11079 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011081 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082}
11083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011084PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011085 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011086\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011087Return True if S is a titlecased string and there is at least one\n\
11088character in S, i.e. upper- and titlecase characters may only\n\
11089follow uncased characters and lowercase characters only cased ones.\n\
11090Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091
11092static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011093unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 Py_ssize_t i, length;
11096 int kind;
11097 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 int cased, previous_is_cased;
11099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (PyUnicode_READY(self) == -1)
11101 return NULL;
11102 length = PyUnicode_GET_LENGTH(self);
11103 kind = PyUnicode_KIND(self);
11104 data = PyUnicode_DATA(self);
11105
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 if (length == 1) {
11108 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11109 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11110 (Py_UNICODE_ISUPPER(ch) != 0));
11111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011113 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011115 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011116
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117 cased = 0;
11118 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 for (i = 0; i < length; i++) {
11120 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011121
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11123 if (previous_is_cased)
11124 return PyBool_FromLong(0);
11125 previous_is_cased = 1;
11126 cased = 1;
11127 }
11128 else if (Py_UNICODE_ISLOWER(ch)) {
11129 if (!previous_is_cased)
11130 return PyBool_FromLong(0);
11131 previous_is_cased = 1;
11132 cased = 1;
11133 }
11134 else
11135 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011137 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138}
11139
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011140PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011141 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011143Return True if all characters in S are whitespace\n\
11144and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145
11146static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011147unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149 Py_ssize_t i, length;
11150 int kind;
11151 void *data;
11152
11153 if (PyUnicode_READY(self) == -1)
11154 return NULL;
11155 length = PyUnicode_GET_LENGTH(self);
11156 kind = PyUnicode_KIND(self);
11157 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (length == 1)
11161 return PyBool_FromLong(
11162 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011164 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 for (i = 0; i < length; i++) {
11169 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011170 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011171 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011173 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174}
11175
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011176PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011177 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011178\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011179Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011180and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011181
11182static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011183unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 Py_ssize_t i, length;
11186 int kind;
11187 void *data;
11188
11189 if (PyUnicode_READY(self) == -1)
11190 return NULL;
11191 length = PyUnicode_GET_LENGTH(self);
11192 kind = PyUnicode_KIND(self);
11193 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011194
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011195 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (length == 1)
11197 return PyBool_FromLong(
11198 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011199
11200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011201 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011202 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 for (i = 0; i < length; i++) {
11205 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011206 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011207 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011208 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011209}
11210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011213\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011214Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011215and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011216
11217static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011218unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011219{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 int kind;
11221 void *data;
11222 Py_ssize_t len, i;
11223
11224 if (PyUnicode_READY(self) == -1)
11225 return NULL;
11226
11227 kind = PyUnicode_KIND(self);
11228 data = PyUnicode_DATA(self);
11229 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011230
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011231 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 if (len == 1) {
11233 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11234 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11235 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011236
11237 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011239 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 for (i = 0; i < len; i++) {
11242 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011243 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011244 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011245 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011246 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011247}
11248
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011249PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011250 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011252Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011253False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
11255static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011256unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011257{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 Py_ssize_t i, length;
11259 int kind;
11260 void *data;
11261
11262 if (PyUnicode_READY(self) == -1)
11263 return NULL;
11264 length = PyUnicode_GET_LENGTH(self);
11265 kind = PyUnicode_KIND(self);
11266 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011269 if (length == 1)
11270 return PyBool_FromLong(
11271 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011273 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011277 for (i = 0; i < length; i++) {
11278 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011281 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282}
11283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011284PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011287Return True if all characters in S are digits\n\
11288and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
11290static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011291unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011293 Py_ssize_t i, length;
11294 int kind;
11295 void *data;
11296
11297 if (PyUnicode_READY(self) == -1)
11298 return NULL;
11299 length = PyUnicode_GET_LENGTH(self);
11300 kind = PyUnicode_KIND(self);
11301 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 if (length == 1) {
11305 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11306 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011309 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011310 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 for (i = 0; i < length; i++) {
11314 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011315 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011317 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318}
11319
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011320PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011323Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011324False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325
11326static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011327unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 Py_ssize_t i, length;
11330 int kind;
11331 void *data;
11332
11333 if (PyUnicode_READY(self) == -1)
11334 return NULL;
11335 length = PyUnicode_GET_LENGTH(self);
11336 kind = PyUnicode_KIND(self);
11337 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338
Guido van Rossumd57fd912000-03-10 22:53:23 +000011339 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (length == 1)
11341 return PyBool_FromLong(
11342 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011343
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011344 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 for (i = 0; i < length; i++) {
11349 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011352 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353}
11354
Martin v. Löwis47383402007-08-15 07:32:56 +000011355int
11356PyUnicode_IsIdentifier(PyObject *self)
11357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 int kind;
11359 void *data;
11360 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011361 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011363 if (PyUnicode_READY(self) == -1) {
11364 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011366 }
11367
11368 /* Special case for empty strings */
11369 if (PyUnicode_GET_LENGTH(self) == 0)
11370 return 0;
11371 kind = PyUnicode_KIND(self);
11372 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011373
11374 /* PEP 3131 says that the first character must be in
11375 XID_Start and subsequent characters in XID_Continue,
11376 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011377 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011378 letters, digits, underscore). However, given the current
11379 definition of XID_Start and XID_Continue, it is sufficient
11380 to check just for these, except that _ must be allowed
11381 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011383 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011384 return 0;
11385
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011386 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011389 return 1;
11390}
11391
11392PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011393 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011394\n\
11395Return True if S is a valid identifier according\n\
11396to the language definition.");
11397
11398static PyObject*
11399unicode_isidentifier(PyObject *self)
11400{
11401 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11402}
11403
Georg Brandl559e5d72008-06-11 18:37:52 +000011404PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011406\n\
11407Return True if all characters in S are considered\n\
11408printable in repr() or S is empty, False otherwise.");
11409
11410static PyObject*
11411unicode_isprintable(PyObject *self)
11412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 Py_ssize_t i, length;
11414 int kind;
11415 void *data;
11416
11417 if (PyUnicode_READY(self) == -1)
11418 return NULL;
11419 length = PyUnicode_GET_LENGTH(self);
11420 kind = PyUnicode_KIND(self);
11421 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011422
11423 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (length == 1)
11425 return PyBool_FromLong(
11426 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 for (i = 0; i < length; i++) {
11429 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011430 Py_RETURN_FALSE;
11431 }
11432 }
11433 Py_RETURN_TRUE;
11434}
11435
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011436PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011437 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438\n\
11439Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011440iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
11442static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011443unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011445 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446}
11447
Martin v. Löwis18e16552006-02-15 17:27:45 +000011448static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011449unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011451 if (PyUnicode_READY(self) == -1)
11452 return -1;
11453 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011454}
11455
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011456PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011457 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011458\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011459Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011460done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
11462static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011463unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011465 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 Py_UCS4 fillchar = ' ';
11467
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011468 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 return NULL;
11470
Benjamin Petersonbac79492012-01-14 13:34:47 -050011471 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011472 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011473
Victor Stinnerc4b49542011-12-11 22:44:26 +010011474 if (PyUnicode_GET_LENGTH(self) >= width)
11475 return unicode_result_unchanged(self);
11476
11477 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011483Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
11485static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011486unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011488 if (PyUnicode_READY(self) == -1)
11489 return NULL;
11490 if (PyUnicode_IS_ASCII(self))
11491 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011492 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493}
11494
/* Strip modes accepted by do_strip(), do_argstrip() and
   _PyUnicode_XStrip(); they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above.
   Both the characters and the table slots are const: nothing may
   rewrite this table at run time. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11503
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011504/* externally visible for str.strip(unicode) */
11505PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011506_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011507{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 void *data;
11509 int kind;
11510 Py_ssize_t i, j, len;
11511 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11514 return NULL;
11515
11516 kind = PyUnicode_KIND(self);
11517 data = PyUnicode_DATA(self);
11518 len = PyUnicode_GET_LENGTH(self);
11519 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11520 PyUnicode_DATA(sepobj),
11521 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011522
Benjamin Peterson14339b62009-01-31 16:36:08 +000011523 i = 0;
11524 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 while (i < len &&
11526 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011527 i++;
11528 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011529 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011530
Benjamin Peterson14339b62009-01-31 16:36:08 +000011531 j = len;
11532 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011533 do {
11534 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 } while (j >= i &&
11536 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011538 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011539
Victor Stinner7931d9a2011-11-04 00:22:48 +010011540 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541}
11542
11543PyObject*
11544PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11545{
11546 unsigned char *data;
11547 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011548 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011549
Victor Stinnerde636f32011-10-01 03:55:54 +020011550 if (PyUnicode_READY(self) == -1)
11551 return NULL;
11552
Victor Stinner684d5fd2012-05-03 02:32:34 +020011553 length = PyUnicode_GET_LENGTH(self);
11554 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011555
Victor Stinner684d5fd2012-05-03 02:32:34 +020011556 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011557 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558
Victor Stinnerde636f32011-10-01 03:55:54 +020011559 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011560 PyErr_SetString(PyExc_IndexError, "string index out of range");
11561 return NULL;
11562 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011563 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011564 Py_INCREF(unicode_empty);
11565 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011566 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011567
Victor Stinner684d5fd2012-05-03 02:32:34 +020011568 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011569 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011570 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011571 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011572 }
11573 else {
11574 kind = PyUnicode_KIND(self);
11575 data = PyUnicode_1BYTE_DATA(self);
11576 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011577 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011578 length);
11579 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011580}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581
11582static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011583do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585 int kind;
11586 void *data;
11587 Py_ssize_t len, i, j;
11588
11589 if (PyUnicode_READY(self) == -1)
11590 return NULL;
11591
11592 kind = PyUnicode_KIND(self);
11593 data = PyUnicode_DATA(self);
11594 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011595
Benjamin Peterson14339b62009-01-31 16:36:08 +000011596 i = 0;
11597 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011598 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011599 i++;
11600 }
11601 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011602
Benjamin Peterson14339b62009-01-31 16:36:08 +000011603 j = len;
11604 if (striptype != LEFTSTRIP) {
11605 do {
11606 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011608 j++;
11609 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011610
Victor Stinner7931d9a2011-11-04 00:22:48 +010011611 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612}
11613
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011614
11615static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011616do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011617{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011618 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011619
Benjamin Peterson14339b62009-01-31 16:36:08 +000011620 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11621 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011622
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 if (sep != NULL && sep != Py_None) {
11624 if (PyUnicode_Check(sep))
11625 return _PyUnicode_XStrip(self, striptype, sep);
11626 else {
11627 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011628 "%s arg must be None or str",
11629 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011630 return NULL;
11631 }
11632 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633
Benjamin Peterson14339b62009-01-31 16:36:08 +000011634 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011635}
11636
11637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011638PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011640\n\
11641Return a copy of the string S with leading and trailing\n\
11642whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011643If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011644
11645static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011646unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011647{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011648 if (PyTuple_GET_SIZE(args) == 0)
11649 return do_strip(self, BOTHSTRIP); /* Common case */
11650 else
11651 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652}
11653
11654
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011655PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011656 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011657\n\
11658Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011659If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011660
11661static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011662unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011663{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011664 if (PyTuple_GET_SIZE(args) == 0)
11665 return do_strip(self, LEFTSTRIP); /* Common case */
11666 else
11667 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011668}
11669
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011673\n\
11674Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011675If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011676
11677static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011678unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011679{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011680 if (PyTuple_GET_SIZE(args) == 0)
11681 return do_strip(self, RIGHTSTRIP); /* Common case */
11682 else
11683 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011684}
11685
11686
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011688unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011690 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692
Georg Brandl222de0f2009-04-12 12:01:50 +000011693 if (len < 1) {
11694 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011695 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
Victor Stinnerc4b49542011-12-11 22:44:26 +010011698 /* no repeat, return original string */
11699 if (len == 1)
11700 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011701
Benjamin Petersonbac79492012-01-14 13:34:47 -050011702 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 return NULL;
11704
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011705 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011706 PyErr_SetString(PyExc_OverflowError,
11707 "repeated string is too long");
11708 return NULL;
11709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011711
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011712 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 if (!u)
11714 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011715 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 if (PyUnicode_GET_LENGTH(str) == 1) {
11718 const int kind = PyUnicode_KIND(str);
11719 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011720 if (kind == PyUnicode_1BYTE_KIND) {
11721 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011722 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011723 }
11724 else if (kind == PyUnicode_2BYTE_KIND) {
11725 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011726 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011727 ucs2[n] = fill_char;
11728 } else {
11729 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11730 assert(kind == PyUnicode_4BYTE_KIND);
11731 for (n = 0; n < len; ++n)
11732 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 }
11735 else {
11736 /* number of characters copied this far */
11737 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011738 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 char *to = (char *) PyUnicode_DATA(u);
11740 Py_MEMCPY(to, PyUnicode_DATA(str),
11741 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 n = (done <= nchars-done) ? done : nchars-done;
11744 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011745 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 }
11748
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011749 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011750 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751}
11752
Alexander Belopolsky40018472011-02-26 01:02:56 +000011753PyObject *
11754PyUnicode_Replace(PyObject *obj,
11755 PyObject *subobj,
11756 PyObject *replobj,
11757 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758{
11759 PyObject *self;
11760 PyObject *str1;
11761 PyObject *str2;
11762 PyObject *result;
11763
11764 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011765 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011768 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 Py_DECREF(self);
11770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771 }
11772 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011773 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 Py_DECREF(self);
11775 Py_DECREF(str1);
11776 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011778 if (PyUnicode_READY(self) == -1 ||
11779 PyUnicode_READY(str1) == -1 ||
11780 PyUnicode_READY(str2) == -1)
11781 result = NULL;
11782 else
11783 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 Py_DECREF(self);
11785 Py_DECREF(str1);
11786 Py_DECREF(str2);
11787 return result;
11788}
11789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011790PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011791 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792\n\
11793Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011794old replaced by new. If the optional argument count is\n\
11795given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
11797static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 PyObject *str1;
11801 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011802 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 PyObject *result;
11804
Martin v. Löwis18e16552006-02-15 17:27:45 +000011805 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011807 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011810 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 return NULL;
11812 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011813 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 Py_DECREF(str1);
11815 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011816 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011817 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11818 result = NULL;
11819 else
11820 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821
11822 Py_DECREF(str1);
11823 Py_DECREF(str2);
11824 return result;
11825}
11826
Alexander Belopolsky40018472011-02-26 01:02:56 +000011827static PyObject *
11828unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011830 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 Py_ssize_t isize;
11832 Py_ssize_t osize, squote, dquote, i, o;
11833 Py_UCS4 max, quote;
11834 int ikind, okind;
11835 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011838 return NULL;
11839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 isize = PyUnicode_GET_LENGTH(unicode);
11841 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 /* Compute length of output, quote characters, and
11844 maximum character */
11845 osize = 2; /* quotes */
11846 max = 127;
11847 squote = dquote = 0;
11848 ikind = PyUnicode_KIND(unicode);
11849 for (i = 0; i < isize; i++) {
11850 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11851 switch (ch) {
11852 case '\'': squote++; osize++; break;
11853 case '"': dquote++; osize++; break;
11854 case '\\': case '\t': case '\r': case '\n':
11855 osize += 2; break;
11856 default:
11857 /* Fast-path ASCII */
11858 if (ch < ' ' || ch == 0x7f)
11859 osize += 4; /* \xHH */
11860 else if (ch < 0x7f)
11861 osize++;
11862 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11863 osize++;
11864 max = ch > max ? ch : max;
11865 }
11866 else if (ch < 0x100)
11867 osize += 4; /* \xHH */
11868 else if (ch < 0x10000)
11869 osize += 6; /* \uHHHH */
11870 else
11871 osize += 10; /* \uHHHHHHHH */
11872 }
11873 }
11874
11875 quote = '\'';
11876 if (squote) {
11877 if (dquote)
11878 /* Both squote and dquote present. Use squote,
11879 and escape them */
11880 osize += squote;
11881 else
11882 quote = '"';
11883 }
11884
11885 repr = PyUnicode_New(osize, max);
11886 if (repr == NULL)
11887 return NULL;
11888 okind = PyUnicode_KIND(repr);
11889 odata = PyUnicode_DATA(repr);
11890
11891 PyUnicode_WRITE(okind, odata, 0, quote);
11892 PyUnicode_WRITE(okind, odata, osize-1, quote);
11893
11894 for (i = 0, o = 1; i < isize; i++) {
11895 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011896
11897 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 if ((ch == quote) || (ch == '\\')) {
11899 PyUnicode_WRITE(okind, odata, o++, '\\');
11900 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011901 continue;
11902 }
11903
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011905 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 PyUnicode_WRITE(okind, odata, o++, '\\');
11907 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011908 }
11909 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 PyUnicode_WRITE(okind, odata, o++, '\\');
11911 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011912 }
11913 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 PyUnicode_WRITE(okind, odata, o++, '\\');
11915 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011916 }
11917
11918 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011919 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 PyUnicode_WRITE(okind, odata, o++, '\\');
11921 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011922 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11923 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011924 }
11925
Georg Brandl559e5d72008-06-11 18:37:52 +000011926 /* Copy ASCII characters as-is */
11927 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011929 }
11930
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011932 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011933 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011934 (categories Z* and C* except ASCII space)
11935 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011937 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000011938 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011941 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11942 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011944 /* Map 16-bit characters to '\uxxxx' */
11945 else if (ch <= 0xffff) {
11946 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011947 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11948 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11949 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11950 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011951 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011952 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011953 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000011954 PyUnicode_WRITE(okind, odata, o++, 'U');
11955 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11956 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11957 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11958 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020011959 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11960 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11961 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11962 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011963 }
11964 }
11965 /* Copy characters as-is */
11966 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011967 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011968 }
11969 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011972 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011973 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974}
11975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011976PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978\n\
11979Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011980such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981arguments start and end are interpreted as in slice notation.\n\
11982\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011983Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984
11985static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011988 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011989 Py_ssize_t start;
11990 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011991 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992
Jesus Ceaac451502011-04-20 17:09:23 +020011993 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11994 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011995 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (PyUnicode_READY(self) == -1)
11998 return NULL;
11999 if (PyUnicode_READY(substring) == -1)
12000 return NULL;
12001
Victor Stinner7931d9a2011-11-04 00:22:48 +010012002 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 if (result == -2)
12007 return NULL;
12008
Christian Heimes217cfd12007-12-02 14:31:20 +000012009 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010}
12011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012012PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012015Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016
12017static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012019{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012020 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012021 Py_ssize_t start;
12022 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012023 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
Jesus Ceaac451502011-04-20 17:09:23 +020012025 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12026 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012027 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 if (PyUnicode_READY(self) == -1)
12030 return NULL;
12031 if (PyUnicode_READY(substring) == -1)
12032 return NULL;
12033
Victor Stinner7931d9a2011-11-04 00:22:48 +010012034 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
12036 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 if (result == -2)
12039 return NULL;
12040
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 if (result < 0) {
12042 PyErr_SetString(PyExc_ValueError, "substring not found");
12043 return NULL;
12044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045
Christian Heimes217cfd12007-12-02 14:31:20 +000012046 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047}
12048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012049PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012050 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012052Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012053done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054
12055static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012056unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012058 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 Py_UCS4 fillchar = ' ';
12060
Victor Stinnere9a29352011-10-01 02:14:59 +020012061 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012062 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012063
Benjamin Petersonbac79492012-01-14 13:34:47 -050012064 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065 return NULL;
12066
Victor Stinnerc4b49542011-12-11 22:44:26 +010012067 if (PyUnicode_GET_LENGTH(self) >= width)
12068 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069
Victor Stinnerc4b49542011-12-11 22:44:26 +010012070 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071}
12072
Alexander Belopolsky40018472011-02-26 01:02:56 +000012073PyObject *
12074PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075{
12076 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012077
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078 s = PyUnicode_FromObject(s);
12079 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012080 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012081 if (sep != NULL) {
12082 sep = PyUnicode_FromObject(sep);
12083 if (sep == NULL) {
12084 Py_DECREF(s);
12085 return NULL;
12086 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087 }
12088
Victor Stinner9310abb2011-10-05 00:59:23 +020012089 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012090
12091 Py_DECREF(s);
12092 Py_XDECREF(sep);
12093 return result;
12094}
12095
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012096PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012097 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098\n\
12099Return a list of the words in S, using sep as the\n\
12100delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012101splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012102whitespace string is a separator and empty strings are\n\
12103removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104
12105static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012106unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012108 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012110 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012112 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12113 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114 return NULL;
12115
12116 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012117 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012119 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012121 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122}
12123
Thomas Wouters477c8d52006-05-27 19:21:47 +000012124PyObject *
12125PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12126{
12127 PyObject* str_obj;
12128 PyObject* sep_obj;
12129 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 int kind1, kind2, kind;
12131 void *buf1 = NULL, *buf2 = NULL;
12132 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012133
12134 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012135 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012137 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012138 if (!sep_obj) {
12139 Py_DECREF(str_obj);
12140 return NULL;
12141 }
12142 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12143 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012144 Py_DECREF(str_obj);
12145 return NULL;
12146 }
12147
Victor Stinner14f8f022011-10-05 20:58:25 +020012148 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012150 kind = Py_MAX(kind1, kind2);
12151 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012153 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 if (!buf1)
12155 goto onError;
12156 buf2 = PyUnicode_DATA(sep_obj);
12157 if (kind2 != kind)
12158 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12159 if (!buf2)
12160 goto onError;
12161 len1 = PyUnicode_GET_LENGTH(str_obj);
12162 len2 = PyUnicode_GET_LENGTH(sep_obj);
12163
Benjamin Petersonead6b532011-12-20 17:23:42 -060012164 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012166 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12167 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12168 else
12169 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 break;
12171 case PyUnicode_2BYTE_KIND:
12172 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12173 break;
12174 case PyUnicode_4BYTE_KIND:
12175 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12176 break;
12177 default:
12178 assert(0);
12179 out = 0;
12180 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012181
12182 Py_DECREF(sep_obj);
12183 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 if (kind1 != kind)
12185 PyMem_Free(buf1);
12186 if (kind2 != kind)
12187 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012188
12189 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 onError:
12191 Py_DECREF(sep_obj);
12192 Py_DECREF(str_obj);
12193 if (kind1 != kind && buf1)
12194 PyMem_Free(buf1);
12195 if (kind2 != kind && buf2)
12196 PyMem_Free(buf2);
12197 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012198}
12199
12200
12201PyObject *
12202PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12203{
12204 PyObject* str_obj;
12205 PyObject* sep_obj;
12206 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 int kind1, kind2, kind;
12208 void *buf1 = NULL, *buf2 = NULL;
12209 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012210
12211 str_obj = PyUnicode_FromObject(str_in);
12212 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012214 sep_obj = PyUnicode_FromObject(sep_in);
12215 if (!sep_obj) {
12216 Py_DECREF(str_obj);
12217 return NULL;
12218 }
12219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 kind1 = PyUnicode_KIND(str_in);
12221 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012222 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 buf1 = PyUnicode_DATA(str_in);
12224 if (kind1 != kind)
12225 buf1 = _PyUnicode_AsKind(str_in, kind);
12226 if (!buf1)
12227 goto onError;
12228 buf2 = PyUnicode_DATA(sep_obj);
12229 if (kind2 != kind)
12230 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12231 if (!buf2)
12232 goto onError;
12233 len1 = PyUnicode_GET_LENGTH(str_obj);
12234 len2 = PyUnicode_GET_LENGTH(sep_obj);
12235
Benjamin Petersonead6b532011-12-20 17:23:42 -060012236 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012238 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12239 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12240 else
12241 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 break;
12243 case PyUnicode_2BYTE_KIND:
12244 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12245 break;
12246 case PyUnicode_4BYTE_KIND:
12247 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12248 break;
12249 default:
12250 assert(0);
12251 out = 0;
12252 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012253
12254 Py_DECREF(sep_obj);
12255 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 if (kind1 != kind)
12257 PyMem_Free(buf1);
12258 if (kind2 != kind)
12259 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012260
12261 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 onError:
12263 Py_DECREF(sep_obj);
12264 Py_DECREF(str_obj);
12265 if (kind1 != kind && buf1)
12266 PyMem_Free(buf1);
12267 if (kind2 != kind && buf2)
12268 PyMem_Free(buf2);
12269 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012270}
12271
12272PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012274\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012275Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012276the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012277found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012278
12279static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012280unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012281{
Victor Stinner9310abb2011-10-05 00:59:23 +020012282 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012283}
12284
12285PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012286 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012287\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012288Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012289the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012290separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012291
12292static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012293unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294{
Victor Stinner9310abb2011-10-05 00:59:23 +020012295 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012296}
12297
Alexander Belopolsky40018472011-02-26 01:02:56 +000012298PyObject *
12299PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012300{
12301 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012302
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012303 s = PyUnicode_FromObject(s);
12304 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012305 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012306 if (sep != NULL) {
12307 sep = PyUnicode_FromObject(sep);
12308 if (sep == NULL) {
12309 Py_DECREF(s);
12310 return NULL;
12311 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012312 }
12313
Victor Stinner9310abb2011-10-05 00:59:23 +020012314 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012315
12316 Py_DECREF(s);
12317 Py_XDECREF(sep);
12318 return result;
12319}
12320
12321PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012322 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012323\n\
12324Return a list of the words in S, using sep as the\n\
12325delimiter string, starting at the end of the string and\n\
12326working to the front. If maxsplit is given, at most maxsplit\n\
12327splits are done. If sep is not specified, any whitespace string\n\
12328is a separator.");
12329
12330static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012331unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012332{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012333 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012334 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012335 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012336
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012337 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12338 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012339 return NULL;
12340
12341 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012342 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012343 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012344 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012345 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012346 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012347}
12348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012349PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012350 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012351\n\
12352Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012353Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012354is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
12356static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012357unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012359 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012360 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012361
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012362 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12363 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012364 return NULL;
12365
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012366 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012367}
12368
12369static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012370PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012371{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012372 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373}
12374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012375PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012376 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377\n\
12378Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012379and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
12381static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012382unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012384 if (PyUnicode_READY(self) == -1)
12385 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012386 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387}
12388
Georg Brandlceee0772007-11-27 23:48:05 +000012389PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012391\n\
12392Return a translation table usable for str.translate().\n\
12393If there is only one argument, it must be a dictionary mapping Unicode\n\
12394ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012395Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012396If there are two arguments, they must be strings of equal length, and\n\
12397in the resulting dictionary, each character in x will be mapped to the\n\
12398character at the same position in y. If there is a third argument, it\n\
12399must be a string, whose characters will be mapped to None in the result.");
12400
12401static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012402unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012403{
12404 PyObject *x, *y = NULL, *z = NULL;
12405 PyObject *new = NULL, *key, *value;
12406 Py_ssize_t i = 0;
12407 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408
Georg Brandlceee0772007-11-27 23:48:05 +000012409 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12410 return NULL;
12411 new = PyDict_New();
12412 if (!new)
12413 return NULL;
12414 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 int x_kind, y_kind, z_kind;
12416 void *x_data, *y_data, *z_data;
12417
Georg Brandlceee0772007-11-27 23:48:05 +000012418 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012419 if (!PyUnicode_Check(x)) {
12420 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12421 "be a string if there is a second argument");
12422 goto err;
12423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012425 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12426 "arguments must have equal length");
12427 goto err;
12428 }
12429 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430 x_kind = PyUnicode_KIND(x);
12431 y_kind = PyUnicode_KIND(y);
12432 x_data = PyUnicode_DATA(x);
12433 y_data = PyUnicode_DATA(y);
12434 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12435 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012436 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012437 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012438 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012439 if (!value) {
12440 Py_DECREF(key);
12441 goto err;
12442 }
Georg Brandlceee0772007-11-27 23:48:05 +000012443 res = PyDict_SetItem(new, key, value);
12444 Py_DECREF(key);
12445 Py_DECREF(value);
12446 if (res < 0)
12447 goto err;
12448 }
12449 /* create entries for deleting chars in z */
12450 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 z_kind = PyUnicode_KIND(z);
12452 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012453 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012455 if (!key)
12456 goto err;
12457 res = PyDict_SetItem(new, key, Py_None);
12458 Py_DECREF(key);
12459 if (res < 0)
12460 goto err;
12461 }
12462 }
12463 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 int kind;
12465 void *data;
12466
Georg Brandlceee0772007-11-27 23:48:05 +000012467 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012468 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012469 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12470 "to maketrans it must be a dict");
12471 goto err;
12472 }
12473 /* copy entries into the new dict, converting string keys to int keys */
12474 while (PyDict_Next(x, &i, &key, &value)) {
12475 if (PyUnicode_Check(key)) {
12476 /* convert string keys to integer keys */
12477 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012478 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012479 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12480 "table must be of length 1");
12481 goto err;
12482 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 kind = PyUnicode_KIND(key);
12484 data = PyUnicode_DATA(key);
12485 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012486 if (!newkey)
12487 goto err;
12488 res = PyDict_SetItem(new, newkey, value);
12489 Py_DECREF(newkey);
12490 if (res < 0)
12491 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012492 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012493 /* just keep integer keys */
12494 if (PyDict_SetItem(new, key, value) < 0)
12495 goto err;
12496 } else {
12497 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12498 "be strings or integers");
12499 goto err;
12500 }
12501 }
12502 }
12503 return new;
12504 err:
12505 Py_DECREF(new);
12506 return NULL;
12507}
12508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511\n\
12512Return a copy of the string S, where all characters have been mapped\n\
12513through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012514Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012515Unmapped characters are left untouched. Characters mapped to None\n\
12516are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
12518static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012521 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522}
12523
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012524PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012527Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
12529static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012530unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012532 if (PyUnicode_READY(self) == -1)
12533 return NULL;
12534 if (PyUnicode_IS_ASCII(self))
12535 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012536 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537}
12538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012539PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012540 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012542Pad a numeric string S with zeros on the left, to fill a field\n\
12543of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
12545static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012546unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012548 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012549 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012550 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 int kind;
12552 void *data;
12553 Py_UCS4 chr;
12554
Martin v. Löwis18e16552006-02-15 17:27:45 +000012555 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012556 return NULL;
12557
Benjamin Petersonbac79492012-01-14 13:34:47 -050012558 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
Victor Stinnerc4b49542011-12-11 22:44:26 +010012561 if (PyUnicode_GET_LENGTH(self) >= width)
12562 return unicode_result_unchanged(self);
12563
12564 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
12566 u = pad(self, fill, 0, '0');
12567
Walter Dörwald068325e2002-04-15 13:36:47 +000012568 if (u == NULL)
12569 return NULL;
12570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 kind = PyUnicode_KIND(u);
12572 data = PyUnicode_DATA(u);
12573 chr = PyUnicode_READ(kind, data, fill);
12574
12575 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012576 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 PyUnicode_WRITE(kind, data, 0, chr);
12578 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579 }
12580
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012581 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012582 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584
#if 0
/* Compiled out by the surrounding "#if 0".  Thin wrapper that forwards self
   to PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012593PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012595\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012596Return True if S starts with the specified prefix, False otherwise.\n\
12597With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012598With optional end, stop comparing S at that position.\n\
12599prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
12601static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012602unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012605 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012606 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012607 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012608 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012609 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610
Jesus Ceaac451502011-04-20 17:09:23 +020012611 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012613 if (PyTuple_Check(subobj)) {
12614 Py_ssize_t i;
12615 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012616 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012617 if (substring == NULL)
12618 return NULL;
12619 result = tailmatch(self, substring, start, end, -1);
12620 Py_DECREF(substring);
12621 if (result) {
12622 Py_RETURN_TRUE;
12623 }
12624 }
12625 /* nothing matched */
12626 Py_RETURN_FALSE;
12627 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012629 if (substring == NULL) {
12630 if (PyErr_ExceptionMatches(PyExc_TypeError))
12631 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12632 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012633 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012634 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012635 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012637 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638}
12639
12640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012641PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012644Return True if S ends with the specified suffix, False otherwise.\n\
12645With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012646With optional end, stop comparing S at that position.\n\
12647suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648
12649static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012650unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012651 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012653 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012654 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012655 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012656 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012657 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658
Jesus Ceaac451502011-04-20 17:09:23 +020012659 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012661 if (PyTuple_Check(subobj)) {
12662 Py_ssize_t i;
12663 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012664 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012666 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012668 result = tailmatch(self, substring, start, end, +1);
12669 Py_DECREF(substring);
12670 if (result) {
12671 Py_RETURN_TRUE;
12672 }
12673 }
12674 Py_RETURN_FALSE;
12675 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012676 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012677 if (substring == NULL) {
12678 if (PyErr_ExceptionMatches(PyExc_TypeError))
12679 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12680 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012682 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012683 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012685 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686}
12687
Victor Stinner202fdca2012-05-07 12:47:02 +020012688Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012689_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012690{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012691 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012692 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12693 writer->data = PyUnicode_DATA(writer->buffer);
12694 writer->kind = PyUnicode_KIND(writer->buffer);
12695}
12696
Victor Stinnerd3f08822012-05-29 12:57:52 +020012697void
12698_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012699{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012700 memset(writer, 0, sizeof(*writer));
12701#ifdef Py_DEBUG
12702 writer->kind = 5; /* invalid kind */
12703#endif
12704 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012705 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012706}
12707
Victor Stinnerd3f08822012-05-29 12:57:52 +020012708int
12709_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12710 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012711{
12712 Py_ssize_t newlen;
12713 PyObject *newbuffer;
12714
Victor Stinnerd3f08822012-05-29 12:57:52 +020012715 assert(length > 0);
12716
Victor Stinner202fdca2012-05-07 12:47:02 +020012717 if (length > PY_SSIZE_T_MAX - writer->pos) {
12718 PyErr_NoMemory();
12719 return -1;
12720 }
12721 newlen = writer->pos + length;
12722
Victor Stinnerd3f08822012-05-29 12:57:52 +020012723 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012724 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012725 /* overallocate 25% to limit the number of resize */
12726 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12727 newlen += newlen / 4;
12728 if (newlen < writer->min_length)
12729 newlen = writer->min_length;
12730 }
12731 writer->buffer = PyUnicode_New(newlen, maxchar);
12732 if (writer->buffer == NULL)
12733 return -1;
12734 _PyUnicodeWriter_Update(writer);
12735 return 0;
12736 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012737
Victor Stinnerd3f08822012-05-29 12:57:52 +020012738 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012739 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012740 /* overallocate 25% to limit the number of resize */
12741 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12742 newlen += newlen / 4;
12743 if (newlen < writer->min_length)
12744 newlen = writer->min_length;
12745 }
12746
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012747 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012748 /* resize + widen */
12749 newbuffer = PyUnicode_New(newlen, maxchar);
12750 if (newbuffer == NULL)
12751 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012752 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12753 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012754 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012755 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012756 }
12757 else {
12758 newbuffer = resize_compact(writer->buffer, newlen);
12759 if (newbuffer == NULL)
12760 return -1;
12761 }
12762 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012763 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012764 }
12765 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012766 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012767 newbuffer = PyUnicode_New(writer->size, maxchar);
12768 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012769 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012770 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12771 writer->buffer, 0, writer->pos);
12772 Py_DECREF(writer->buffer);
12773 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012774 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012775 }
12776 return 0;
12777}
12778
Victor Stinnerd3f08822012-05-29 12:57:52 +020012779int
12780_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12781{
12782 Py_UCS4 maxchar;
12783 Py_ssize_t len;
12784
12785 if (PyUnicode_READY(str) == -1)
12786 return -1;
12787 len = PyUnicode_GET_LENGTH(str);
12788 if (len == 0)
12789 return 0;
12790 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12791 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012792 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012793 Py_INCREF(str);
12794 writer->buffer = str;
12795 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012796 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012797 writer->size = 0;
12798 writer->pos += len;
12799 return 0;
12800 }
12801 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12802 return -1;
12803 }
12804 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12805 str, 0, len);
12806 writer->pos += len;
12807 return 0;
12808}
12809
Victor Stinnere215d962012-10-06 23:03:36 +020012810int
12811_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
12812{
12813 Py_UCS4 maxchar;
12814
12815 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
12816 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
12817 return -1;
12818 unicode_write_cstr(writer->buffer, writer->pos, str, len);
12819 writer->pos += len;
12820 return 0;
12821}
12822
Victor Stinnerd3f08822012-05-29 12:57:52 +020012823PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012824_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012825{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012826 if (writer->pos == 0) {
12827 Py_XDECREF(writer->buffer);
12828 Py_INCREF(unicode_empty);
12829 return unicode_empty;
12830 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012831 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012832 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12833 return writer->buffer;
12834 }
12835 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12836 PyObject *newbuffer;
12837 newbuffer = resize_compact(writer->buffer, writer->pos);
12838 if (newbuffer == NULL) {
12839 Py_DECREF(writer->buffer);
12840 return NULL;
12841 }
12842 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012843 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012844 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012845 return writer->buffer;
12846}
12847
/* Release the writer's buffer, if any, and reset the field to NULL.
   Used on error paths where _PyUnicodeWriter_Finish() is not called (see
   unicode__format__). */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}
12853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012855
/* Docstring attached to the "format" entry of unicode_methods, which is
   implemented by do_string_format. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods, which is
   implemented by do_string_format_map. */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012867
Eric Smith4a7d76d2008-05-30 18:10:19 +000012868static PyObject *
12869unicode__format__(PyObject* self, PyObject* args)
12870{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012871 PyObject *format_spec;
12872 _PyUnicodeWriter writer;
12873 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012874
12875 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12876 return NULL;
12877
Victor Stinnerd3f08822012-05-29 12:57:52 +020012878 if (PyUnicode_READY(self) == -1)
12879 return NULL;
12880 _PyUnicodeWriter_Init(&writer, 0);
12881 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12882 self, format_spec, 0,
12883 PyUnicode_GET_LENGTH(format_spec));
12884 if (ret == -1) {
12885 _PyUnicodeWriter_Dealloc(&writer);
12886 return NULL;
12887 }
12888 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012889}
12890
Eric Smith8c663262007-08-25 02:26:07 +000012891PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012892 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012893\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012894Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012895
12896static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012897unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 Py_ssize_t size;
12900
12901 /* If it's a compact object, account for base structure +
12902 character data. */
12903 if (PyUnicode_IS_COMPACT_ASCII(v))
12904 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12905 else if (PyUnicode_IS_COMPACT(v))
12906 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012907 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 else {
12909 /* If it is a two-block object, account for base object, and
12910 for character block if present. */
12911 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012912 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012914 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012915 }
12916 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012917 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012918 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012920 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012921 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922
12923 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012924}
12925
12926PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012928
12929static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012930unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012931{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010012932 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 if (!copy)
12934 return NULL;
12935 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012936}
12937
/* Method table.  Each entry gives the Python-visible method name, the C
   function implementing it, the METH_* calling-convention flags and the
   docstring; the table ends with the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* format and format_map are implemented by do_string_format and
       do_string_format_map rather than by unicode_* functions */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* registered without a docstring */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
12994
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012995static PyObject *
12996unicode_mod(PyObject *v, PyObject *w)
12997{
Brian Curtindfc80e32011-08-10 20:28:54 -050012998 if (!PyUnicode_Check(v))
12999 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013000 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013001}
13002
/* Number protocol table.  Of the slots listed here only nb_remainder is
   filled in (with unicode_mod); the slots after it are not listed and are
   therefore zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13009
/* Sequence protocol table: length, concatenation, repetition, item access
   and containment are provided; the slice and assignment slots are left
   empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13020
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013021static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013022unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 if (PyUnicode_READY(self) == -1)
13025 return NULL;
13026
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013027 if (PyIndex_Check(item)) {
13028 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013029 if (i == -1 && PyErr_Occurred())
13030 return NULL;
13031 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013032 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013033 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013034 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013035 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013036 PyObject *result;
13037 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013038 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013039 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013043 return NULL;
13044 }
13045
13046 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013047 Py_INCREF(unicode_empty);
13048 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013049 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013050 slicelength == PyUnicode_GET_LENGTH(self)) {
13051 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013052 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013053 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013054 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013055 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013056 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013057 src_kind = PyUnicode_KIND(self);
13058 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013059 if (!PyUnicode_IS_ASCII(self)) {
13060 kind_limit = kind_maxchar_limit(src_kind);
13061 max_char = 0;
13062 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13063 ch = PyUnicode_READ(src_kind, src_data, cur);
13064 if (ch > max_char) {
13065 max_char = ch;
13066 if (max_char >= kind_limit)
13067 break;
13068 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013069 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013070 }
Victor Stinner55c99112011-10-13 01:17:06 +020013071 else
13072 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013073 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013074 if (result == NULL)
13075 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013076 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013077 dest_data = PyUnicode_DATA(result);
13078
13079 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013080 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13081 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013082 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013083 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013084 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013085 } else {
13086 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13087 return NULL;
13088 }
13089}
13090
13091static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013092 (lenfunc)unicode_length, /* mp_length */
13093 (binaryfunc)unicode_subscript, /* mp_subscript */
13094 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013095};
13096
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098/* Helpers for PyUnicode_Format() */
13099
Victor Stinnera47082312012-10-04 02:19:54 +020013100struct unicode_formatter_t {
13101 PyObject *args;
13102 int args_owned;
13103 Py_ssize_t arglen, argidx;
13104 PyObject *dict;
13105
13106 enum PyUnicode_Kind fmtkind;
13107 Py_ssize_t fmtcnt, fmtpos;
13108 void *fmtdata;
13109 PyObject *fmtstr;
13110
13111 _PyUnicodeWriter writer;
13112};
13113
13114struct unicode_format_arg_t {
13115 Py_UCS4 ch;
13116 int flags;
13117 Py_ssize_t width;
13118 int prec;
13119 int sign;
13120};
13121
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013123unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124{
Victor Stinnera47082312012-10-04 02:19:54 +020013125 Py_ssize_t argidx = ctx->argidx;
13126
13127 if (argidx < ctx->arglen) {
13128 ctx->argidx++;
13129 if (ctx->arglen < 0)
13130 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 else
Victor Stinnera47082312012-10-04 02:19:54 +020013132 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133 }
13134 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 return NULL;
13137}
13138
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013139/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140
Victor Stinnera47082312012-10-04 02:19:54 +020013141/* Format a float into the writer if the writer is not NULL, or into *p_output
13142 otherwise.
13143
13144 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013145static int
Victor Stinnera47082312012-10-04 02:19:54 +020013146formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13147 PyObject **p_output,
13148 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013150 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013151 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013152 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013153 int prec;
13154 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013155
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156 x = PyFloat_AsDouble(v);
13157 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013158 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013159
Victor Stinnera47082312012-10-04 02:19:54 +020013160 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013162 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013163
Victor Stinnera47082312012-10-04 02:19:54 +020013164 if (arg->flags & F_ALT)
13165 dtoa_flags = Py_DTSF_ALT;
13166 else
13167 dtoa_flags = 0;
13168 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013169 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013170 return -1;
13171 len = strlen(p);
13172 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013173 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13174 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013175 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013176 }
Victor Stinner184252a2012-06-16 02:57:41 +020013177 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013178 writer->pos += len;
13179 }
13180 else
13181 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013182 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013183 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013184}
13185
Victor Stinnerd0880d52012-04-27 23:40:13 +020013186/* formatlong() emulates the format codes d, u, o, x and X, and
13187 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13188 * Python's regular ints.
13189 * Return value: a new PyUnicodeObject*, or NULL if error.
13190 * The output string is of the form
13191 * "-"? ("0x" | "0X")? digit+
13192 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13193 * set in flags. The case of hex digits will be correct,
13194 * There will be at least prec digits, zero-filled on the left if
13195 * necessary to get that many.
13196 * val object to be converted
13197 * flags bitmask of format flags; only F_ALT is looked at
13198 * prec minimum number of digits; 0-fill on left if needed
13199 * type a character in [duoxX]; u acts the same as d
13200 *
13201 * CAUTION: o, x and X conversions on regular ints can never
13202 * produce a '-' sign, but can for Python's unbounded ints.
13203 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013204static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013205formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013206{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013207 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013208 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013209 Py_ssize_t i;
13210 int sign; /* 1 if '-', else 0 */
13211 int len; /* number of characters */
13212 Py_ssize_t llen;
13213 int numdigits; /* len == numnondigits + numdigits */
13214 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013215 int prec = arg->prec;
13216 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013217
Victor Stinnerd0880d52012-04-27 23:40:13 +020013218 /* Avoid exceeding SSIZE_T_MAX */
13219 if (prec > INT_MAX-3) {
13220 PyErr_SetString(PyExc_OverflowError,
13221 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013222 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013223 }
13224
13225 assert(PyLong_Check(val));
13226
13227 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013228 default:
13229 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013230 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013231 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013232 case 'u':
13233 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013234 if (PyBool_Check(val))
13235 result = PyNumber_ToBase(val, 10);
13236 else
13237 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013238 break;
13239 case 'o':
13240 numnondigits = 2;
13241 result = PyNumber_ToBase(val, 8);
13242 break;
13243 case 'x':
13244 case 'X':
13245 numnondigits = 2;
13246 result = PyNumber_ToBase(val, 16);
13247 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013248 }
13249 if (!result)
13250 return NULL;
13251
13252 assert(unicode_modifiable(result));
13253 assert(PyUnicode_IS_READY(result));
13254 assert(PyUnicode_IS_ASCII(result));
13255
13256 /* To modify the string in-place, there can only be one reference. */
13257 if (Py_REFCNT(result) != 1) {
13258 PyErr_BadInternalCall();
13259 return NULL;
13260 }
13261 buf = PyUnicode_DATA(result);
13262 llen = PyUnicode_GET_LENGTH(result);
13263 if (llen > INT_MAX) {
13264 PyErr_SetString(PyExc_ValueError,
13265 "string too large in _PyBytes_FormatLong");
13266 return NULL;
13267 }
13268 len = (int)llen;
13269 sign = buf[0] == '-';
13270 numnondigits += sign;
13271 numdigits = len - numnondigits;
13272 assert(numdigits > 0);
13273
13274 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013275 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013276 (type == 'o' || type == 'x' || type == 'X'))) {
13277 assert(buf[sign] == '0');
13278 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13279 buf[sign+1] == 'o');
13280 numnondigits -= 2;
13281 buf += 2;
13282 len -= 2;
13283 if (sign)
13284 buf[0] = '-';
13285 assert(len == numnondigits + numdigits);
13286 assert(numdigits > 0);
13287 }
13288
13289 /* Fill with leading zeroes to meet minimum width. */
13290 if (prec > numdigits) {
13291 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13292 numnondigits + prec);
13293 char *b1;
13294 if (!r1) {
13295 Py_DECREF(result);
13296 return NULL;
13297 }
13298 b1 = PyBytes_AS_STRING(r1);
13299 for (i = 0; i < numnondigits; ++i)
13300 *b1++ = *buf++;
13301 for (i = 0; i < prec - numdigits; i++)
13302 *b1++ = '0';
13303 for (i = 0; i < numdigits; i++)
13304 *b1++ = *buf++;
13305 *b1 = '\0';
13306 Py_DECREF(result);
13307 result = r1;
13308 buf = PyBytes_AS_STRING(result);
13309 len = numnondigits + prec;
13310 }
13311
13312 /* Fix up case for hex conversions. */
13313 if (type == 'X') {
13314 /* Need to convert all lower case letters to upper case.
13315 and need to convert 0x to 0X (and -0x to -0X). */
13316 for (i = 0; i < len; i++)
13317 if (buf[i] >= 'a' && buf[i] <= 'x')
13318 buf[i] -= 'a'-'A';
13319 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013320 if (!PyUnicode_Check(result)
13321 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013322 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013323 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013324 Py_DECREF(result);
13325 result = unicode;
13326 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013327 else if (len != PyUnicode_GET_LENGTH(result)) {
13328 if (PyUnicode_Resize(&result, len) < 0)
13329 Py_CLEAR(result);
13330 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013331 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013332}
13333
Victor Stinner621ef3d2012-10-02 00:33:47 +020013334/* Format an integer.
13335 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013336 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013337 * -1 and raise an exception on error */
13338static int
Victor Stinnera47082312012-10-04 02:19:54 +020013339mainformatlong(PyObject *v,
13340 struct unicode_format_arg_t *arg,
13341 PyObject **p_output,
13342 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013343{
13344 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013345 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013346
13347 if (!PyNumber_Check(v))
13348 goto wrongtype;
13349
13350 if (!PyLong_Check(v)) {
13351 iobj = PyNumber_Long(v);
13352 if (iobj == NULL) {
13353 if (PyErr_ExceptionMatches(PyExc_TypeError))
13354 goto wrongtype;
13355 return -1;
13356 }
13357 assert(PyLong_Check(iobj));
13358 }
13359 else {
13360 iobj = v;
13361 Py_INCREF(iobj);
13362 }
13363
13364 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013365 && arg->width == -1 && arg->prec == -1
13366 && !(arg->flags & (F_SIGN | F_BLANK))
13367 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013368 {
13369 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013370 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013371 int base;
13372
Victor Stinnera47082312012-10-04 02:19:54 +020013373 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013374 {
13375 default:
13376 assert(0 && "'type' not in [diuoxX]");
13377 case 'd':
13378 case 'i':
13379 case 'u':
13380 base = 10;
13381 break;
13382 case 'o':
13383 base = 8;
13384 break;
13385 case 'x':
13386 case 'X':
13387 base = 16;
13388 break;
13389 }
13390
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013391 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13392 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013393 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013394 }
13395 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013396 return 1;
13397 }
13398
Victor Stinnera47082312012-10-04 02:19:54 +020013399 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013400 Py_DECREF(iobj);
13401 if (res == NULL)
13402 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013403 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013404 return 0;
13405
13406wrongtype:
13407 PyErr_Format(PyExc_TypeError,
13408 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013409 "not %.200s",
13410 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013411 return -1;
13412}
13413
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414static Py_UCS4
13415formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013417 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013418 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013420 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 goto onError;
13423 }
13424 else {
13425 /* Integer input truncated to a character */
13426 long x;
13427 x = PyLong_AsLong(v);
13428 if (x == -1 && PyErr_Occurred())
13429 goto onError;
13430
Victor Stinner8faf8212011-12-08 22:14:11 +010013431 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 PyErr_SetString(PyExc_OverflowError,
13433 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013434 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 }
13436
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013437 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013438 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013439
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013441 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013442 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013443 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444}
13445
Victor Stinnera47082312012-10-04 02:19:54 +020013446/* Parse options of an argument: flags, width, precision.
13447 Handle also "%(name)" syntax.
13448
13449 Return 0 if the argument has been formatted into arg->str.
13450 Return 1 if the argument has been written into ctx->writer,
13451 Raise an exception and return -1 on error. */
13452static int
13453unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13454 struct unicode_format_arg_t *arg)
13455{
13456#define FORMAT_READ(ctx) \
13457 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13458
13459 PyObject *v;
13460
13461 arg->ch = FORMAT_READ(ctx);
13462 if (arg->ch == '(') {
13463 /* Get argument value from a dictionary. Example: "%(name)s". */
13464 Py_ssize_t keystart;
13465 Py_ssize_t keylen;
13466 PyObject *key;
13467 int pcount = 1;
13468
13469 if (ctx->dict == NULL) {
13470 PyErr_SetString(PyExc_TypeError,
13471 "format requires a mapping");
13472 return -1;
13473 }
13474 ++ctx->fmtpos;
13475 --ctx->fmtcnt;
13476 keystart = ctx->fmtpos;
13477 /* Skip over balanced parentheses */
13478 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13479 arg->ch = FORMAT_READ(ctx);
13480 if (arg->ch == ')')
13481 --pcount;
13482 else if (arg->ch == '(')
13483 ++pcount;
13484 ctx->fmtpos++;
13485 }
13486 keylen = ctx->fmtpos - keystart - 1;
13487 if (ctx->fmtcnt < 0 || pcount > 0) {
13488 PyErr_SetString(PyExc_ValueError,
13489 "incomplete format key");
13490 return -1;
13491 }
13492 key = PyUnicode_Substring(ctx->fmtstr,
13493 keystart, keystart + keylen);
13494 if (key == NULL)
13495 return -1;
13496 if (ctx->args_owned) {
13497 Py_DECREF(ctx->args);
13498 ctx->args_owned = 0;
13499 }
13500 ctx->args = PyObject_GetItem(ctx->dict, key);
13501 Py_DECREF(key);
13502 if (ctx->args == NULL)
13503 return -1;
13504 ctx->args_owned = 1;
13505 ctx->arglen = -1;
13506 ctx->argidx = -2;
13507 }
13508
13509 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13510 arg->flags = 0;
13511 while (--ctx->fmtcnt >= 0) {
13512 arg->ch = FORMAT_READ(ctx);
13513 ctx->fmtpos++;
13514 switch (arg->ch) {
13515 case '-': arg->flags |= F_LJUST; continue;
13516 case '+': arg->flags |= F_SIGN; continue;
13517 case ' ': arg->flags |= F_BLANK; continue;
13518 case '#': arg->flags |= F_ALT; continue;
13519 case '0': arg->flags |= F_ZERO; continue;
13520 }
13521 break;
13522 }
13523
13524 /* Parse width. Example: "%10s" => width=10 */
13525 arg->width = -1;
13526 if (arg->ch == '*') {
13527 v = unicode_format_getnextarg(ctx);
13528 if (v == NULL)
13529 return -1;
13530 if (!PyLong_Check(v)) {
13531 PyErr_SetString(PyExc_TypeError,
13532 "* wants int");
13533 return -1;
13534 }
13535 arg->width = PyLong_AsLong(v);
13536 if (arg->width == -1 && PyErr_Occurred())
13537 return -1;
13538 if (arg->width < 0) {
13539 arg->flags |= F_LJUST;
13540 arg->width = -arg->width;
13541 }
13542 if (--ctx->fmtcnt >= 0) {
13543 arg->ch = FORMAT_READ(ctx);
13544 ctx->fmtpos++;
13545 }
13546 }
13547 else if (arg->ch >= '0' && arg->ch <= '9') {
13548 arg->width = arg->ch - '0';
13549 while (--ctx->fmtcnt >= 0) {
13550 arg->ch = FORMAT_READ(ctx);
13551 ctx->fmtpos++;
13552 if (arg->ch < '0' || arg->ch > '9')
13553 break;
13554 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13555 mixing signed and unsigned comparison. Since arg->ch is between
13556 '0' and '9', casting to int is safe. */
13557 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13558 PyErr_SetString(PyExc_ValueError,
13559 "width too big");
13560 return -1;
13561 }
13562 arg->width = arg->width*10 + (arg->ch - '0');
13563 }
13564 }
13565
13566 /* Parse precision. Example: "%.3f" => prec=3 */
13567 arg->prec = -1;
13568 if (arg->ch == '.') {
13569 arg->prec = 0;
13570 if (--ctx->fmtcnt >= 0) {
13571 arg->ch = FORMAT_READ(ctx);
13572 ctx->fmtpos++;
13573 }
13574 if (arg->ch == '*') {
13575 v = unicode_format_getnextarg(ctx);
13576 if (v == NULL)
13577 return -1;
13578 if (!PyLong_Check(v)) {
13579 PyErr_SetString(PyExc_TypeError,
13580 "* wants int");
13581 return -1;
13582 }
13583 arg->prec = PyLong_AsLong(v);
13584 if (arg->prec == -1 && PyErr_Occurred())
13585 return -1;
13586 if (arg->prec < 0)
13587 arg->prec = 0;
13588 if (--ctx->fmtcnt >= 0) {
13589 arg->ch = FORMAT_READ(ctx);
13590 ctx->fmtpos++;
13591 }
13592 }
13593 else if (arg->ch >= '0' && arg->ch <= '9') {
13594 arg->prec = arg->ch - '0';
13595 while (--ctx->fmtcnt >= 0) {
13596 arg->ch = FORMAT_READ(ctx);
13597 ctx->fmtpos++;
13598 if (arg->ch < '0' || arg->ch > '9')
13599 break;
13600 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13601 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020013602 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020013603 return -1;
13604 }
13605 arg->prec = arg->prec*10 + (arg->ch - '0');
13606 }
13607 }
13608 }
13609
13610 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13611 if (ctx->fmtcnt >= 0) {
13612 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13613 if (--ctx->fmtcnt >= 0) {
13614 arg->ch = FORMAT_READ(ctx);
13615 ctx->fmtpos++;
13616 }
13617 }
13618 }
13619 if (ctx->fmtcnt < 0) {
13620 PyErr_SetString(PyExc_ValueError,
13621 "incomplete format");
13622 return -1;
13623 }
13624 return 0;
13625
13626#undef FORMAT_READ
13627}
13628
13629/* Format one argument. Supported conversion specifiers:
13630
13631 - "s", "r", "a": any type
13632 - "i", "d", "u", "o", "x", "X": int
13633 - "e", "E", "f", "F", "g", "G": float
13634 - "c": int or str (1 character)
13635
13636 Return 0 if the argument has been formatted into *p_str,
13637 1 if the argument has been written into ctx->writer,
13638 -1 on error. */
13639static int
13640unicode_format_arg_format(struct unicode_formatter_t *ctx,
13641 struct unicode_format_arg_t *arg,
13642 PyObject **p_str)
13643{
13644 PyObject *v;
13645 _PyUnicodeWriter *writer = &ctx->writer;
13646
13647 if (ctx->fmtcnt == 0)
13648 ctx->writer.overallocate = 0;
13649
13650 if (arg->ch == '%') {
13651 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13652 return -1;
13653 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13654 writer->pos += 1;
13655 return 1;
13656 }
13657
13658 v = unicode_format_getnextarg(ctx);
13659 if (v == NULL)
13660 return -1;
13661
13662 arg->sign = 0;
13663
13664 switch (arg->ch) {
13665
13666 case 's':
13667 case 'r':
13668 case 'a':
13669 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13670 /* Fast path */
13671 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13672 return -1;
13673 return 1;
13674 }
13675
13676 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13677 *p_str = v;
13678 Py_INCREF(*p_str);
13679 }
13680 else {
13681 if (arg->ch == 's')
13682 *p_str = PyObject_Str(v);
13683 else if (arg->ch == 'r')
13684 *p_str = PyObject_Repr(v);
13685 else
13686 *p_str = PyObject_ASCII(v);
13687 }
13688 break;
13689
13690 case 'i':
13691 case 'd':
13692 case 'u':
13693 case 'o':
13694 case 'x':
13695 case 'X':
13696 {
13697 int ret = mainformatlong(v, arg, p_str, writer);
13698 if (ret != 0)
13699 return ret;
13700 arg->sign = 1;
13701 break;
13702 }
13703
13704 case 'e':
13705 case 'E':
13706 case 'f':
13707 case 'F':
13708 case 'g':
13709 case 'G':
13710 if (arg->width == -1 && arg->prec == -1
13711 && !(arg->flags & (F_SIGN | F_BLANK)))
13712 {
13713 /* Fast path */
13714 if (formatfloat(v, arg, NULL, writer) == -1)
13715 return -1;
13716 return 1;
13717 }
13718
13719 arg->sign = 1;
13720 if (formatfloat(v, arg, p_str, NULL) == -1)
13721 return -1;
13722 break;
13723
13724 case 'c':
13725 {
13726 Py_UCS4 ch = formatchar(v);
13727 if (ch == (Py_UCS4) -1)
13728 return -1;
13729 if (arg->width == -1 && arg->prec == -1) {
13730 /* Fast path */
13731 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13732 return -1;
13733 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13734 writer->pos += 1;
13735 return 1;
13736 }
13737 *p_str = PyUnicode_FromOrdinal(ch);
13738 break;
13739 }
13740
13741 default:
13742 PyErr_Format(PyExc_ValueError,
13743 "unsupported format character '%c' (0x%x) "
13744 "at index %zd",
13745 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13746 (int)arg->ch,
13747 ctx->fmtpos - 1);
13748 return -1;
13749 }
13750 if (*p_str == NULL)
13751 return -1;
13752 assert (PyUnicode_Check(*p_str));
13753 return 0;
13754}
13755
/* Write the formatted string str into ctx->writer, applying the width,
   precision, sign and alternate-form options described by arg.

   arg->width and arg->sign are modified while the field is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar, bufmaxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used for numeric text with the F_ZERO flag;
       everything else is padded with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag, so the
           string is written unchanged */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The text already starts with a sign: remember it and skip
               it in the source string */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    bufmaxchar = 127;
    if (!(arg->flags & F_LJUST)) {
        /* Left padding with the fill character will be written */
        if (arg->sign) {
            if ((arg->width-1) > len)
                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
        }
        else {
            if (arg->width > len)
                bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
        }
    }
    maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
    bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
    buflen = arg->width;
    /* One extra character for the sign when the digits alone already fill
       the field */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With zero padding the sign goes before the padding; with space
           padding it is written later, after the padding */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes one column of the field */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from the
           digits in both the width and the length */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done: no right padding must follow */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with spaces if needed (the fill character is never used on
       the right) */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
13907
13908/* Helper of PyUnicode_Format(): format one arg.
13909 Return 0 on success, raise an exception and return -1 on error. */
13910static int
13911unicode_format_arg(struct unicode_formatter_t *ctx)
13912{
13913 struct unicode_format_arg_t arg;
13914 PyObject *str;
13915 int ret;
13916
13917 ret = unicode_format_arg_parse(ctx, &arg);
13918 if (ret == -1)
13919 return -1;
13920
13921 ret = unicode_format_arg_format(ctx, &arg, &str);
13922 if (ret == -1)
13923 return -1;
13924
13925 if (ret != 1) {
13926 ret = unicode_format_arg_output(ctx, &arg, str);
13927 Py_DECREF(str);
13928 if (ret == -1)
13929 return -1;
13930 }
13931
13932 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
13933 PyErr_SetString(PyExc_TypeError,
13934 "not all arguments converted during string formatting");
13935 return -1;
13936 }
13937 return 0;
13938}
13939
Alexander Belopolsky40018472011-02-26 01:02:56 +000013940PyObject *
13941PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942{
Victor Stinnera47082312012-10-04 02:19:54 +020013943 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000013944
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013946 PyErr_BadInternalCall();
13947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948 }
Victor Stinnera47082312012-10-04 02:19:54 +020013949
13950 ctx.fmtstr = PyUnicode_FromObject(format);
13951 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013952 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020013953 if (PyUnicode_READY(ctx.fmtstr) == -1) {
13954 Py_DECREF(ctx.fmtstr);
13955 return NULL;
13956 }
13957 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
13958 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
13959 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
13960 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013961
Victor Stinnera47082312012-10-04 02:19:54 +020013962 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013963
Guido van Rossumd57fd912000-03-10 22:53:23 +000013964 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020013965 ctx.arglen = PyTuple_Size(args);
13966 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013967 }
13968 else {
Victor Stinnera47082312012-10-04 02:19:54 +020013969 ctx.arglen = -1;
13970 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971 }
Victor Stinnera47082312012-10-04 02:19:54 +020013972 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013973 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020013974 ctx.dict = args;
13975 else
13976 ctx.dict = NULL;
13977 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978
Victor Stinnera47082312012-10-04 02:19:54 +020013979 while (--ctx.fmtcnt >= 0) {
13980 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13981 Py_ssize_t nonfmtpos, sublen;
13982 Py_UCS4 maxchar;
13983
13984 nonfmtpos = ctx.fmtpos++;
13985 while (ctx.fmtcnt >= 0 &&
13986 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
13987 ctx.fmtpos++;
13988 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 }
Victor Stinnera47082312012-10-04 02:19:54 +020013990 if (ctx.fmtcnt < 0) {
13991 ctx.fmtpos--;
13992 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020013993 }
Victor Stinnera47082312012-10-04 02:19:54 +020013994 sublen = ctx.fmtpos - nonfmtpos;
13995 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020013996 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020013997 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013998 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013999
Victor Stinnera47082312012-10-04 02:19:54 +020014000 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
14001 ctx.fmtstr, nonfmtpos, sublen);
14002 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014003 }
14004 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014005 ctx.fmtpos++;
14006 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014008 }
14009 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014010
Victor Stinnera47082312012-10-04 02:19:54 +020014011 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 PyErr_SetString(PyExc_TypeError,
14013 "not all arguments converted during string formatting");
14014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014015 }
14016
Victor Stinnera47082312012-10-04 02:19:54 +020014017 if (ctx.args_owned) {
14018 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014019 }
Victor Stinnera47082312012-10-04 02:19:54 +020014020 Py_DECREF(ctx.fmtstr);
14021 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014022
Benjamin Peterson29060642009-01-31 22:14:21 +000014023 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014024 Py_DECREF(ctx.fmtstr);
14025 _PyUnicodeWriter_Dealloc(&ctx.writer);
14026 if (ctx.args_owned) {
14027 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014028 }
14029 return NULL;
14030}
14031
Jeremy Hylton938ace62002-07-17 16:30:39 +000014032static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014033unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14034
Tim Peters6d6c1a32001-08-02 04:15:00 +000014035static PyObject *
14036unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14037{
Benjamin Peterson29060642009-01-31 22:14:21 +000014038 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014039 static char *kwlist[] = {"object", "encoding", "errors", 0};
14040 char *encoding = NULL;
14041 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014042
Benjamin Peterson14339b62009-01-31 16:36:08 +000014043 if (type != &PyUnicode_Type)
14044 return unicode_subtype_new(type, args, kwds);
14045 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014046 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014047 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014048 if (x == NULL) {
14049 Py_INCREF(unicode_empty);
14050 return unicode_empty;
14051 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014052 if (encoding == NULL && errors == NULL)
14053 return PyObject_Str(x);
14054 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014055 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014056}
14057
Guido van Rossume023fe02001-08-30 03:12:59 +000014058static PyObject *
14059unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14060{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014061 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014062 Py_ssize_t length, char_size;
14063 int share_wstr, share_utf8;
14064 unsigned int kind;
14065 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014066
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014068
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014069 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014071 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014072 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014073 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014074 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014075 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014076 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014077
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014078 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014079 if (self == NULL) {
14080 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014081 return NULL;
14082 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014083 kind = PyUnicode_KIND(unicode);
14084 length = PyUnicode_GET_LENGTH(unicode);
14085
14086 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014087#ifdef Py_DEBUG
14088 _PyUnicode_HASH(self) = -1;
14089#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014090 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014091#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014092 _PyUnicode_STATE(self).interned = 0;
14093 _PyUnicode_STATE(self).kind = kind;
14094 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014095 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014096 _PyUnicode_STATE(self).ready = 1;
14097 _PyUnicode_WSTR(self) = NULL;
14098 _PyUnicode_UTF8_LENGTH(self) = 0;
14099 _PyUnicode_UTF8(self) = NULL;
14100 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014101 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014102
14103 share_utf8 = 0;
14104 share_wstr = 0;
14105 if (kind == PyUnicode_1BYTE_KIND) {
14106 char_size = 1;
14107 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14108 share_utf8 = 1;
14109 }
14110 else if (kind == PyUnicode_2BYTE_KIND) {
14111 char_size = 2;
14112 if (sizeof(wchar_t) == 2)
14113 share_wstr = 1;
14114 }
14115 else {
14116 assert(kind == PyUnicode_4BYTE_KIND);
14117 char_size = 4;
14118 if (sizeof(wchar_t) == 4)
14119 share_wstr = 1;
14120 }
14121
14122 /* Ensure we won't overflow the length. */
14123 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14124 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014125 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014126 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014127 data = PyObject_MALLOC((length + 1) * char_size);
14128 if (data == NULL) {
14129 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014130 goto onError;
14131 }
14132
Victor Stinnerc3c74152011-10-02 20:39:55 +020014133 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014134 if (share_utf8) {
14135 _PyUnicode_UTF8_LENGTH(self) = length;
14136 _PyUnicode_UTF8(self) = data;
14137 }
14138 if (share_wstr) {
14139 _PyUnicode_WSTR_LENGTH(self) = length;
14140 _PyUnicode_WSTR(self) = (wchar_t *)data;
14141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014142
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014143 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014144 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014145 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014146#ifdef Py_DEBUG
14147 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14148#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014149 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014150 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014151
14152onError:
14153 Py_DECREF(unicode);
14154 Py_DECREF(self);
14155 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014156}
14157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014158PyDoc_STRVAR(unicode_doc,
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014159 "str(object[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014160\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014161Create a new string object from the given object. If encoding or\n\
14162errors is specified, then the object must expose a data buffer\n\
14163that will be decoded using the given encoding and error handler.\n\
14164Otherwise, returns the result of object.__str__() (if defined)\n\
14165or repr(object).\n\
14166encoding defaults to sys.getdefaultencoding().\n\
14167errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014168
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014169static PyObject *unicode_iter(PyObject *seq);
14170
Guido van Rossumd57fd912000-03-10 22:53:23 +000014171PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014172 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014173 "str", /* tp_name */
14174 sizeof(PyUnicodeObject), /* tp_size */
14175 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014176 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014177 (destructor)unicode_dealloc, /* tp_dealloc */
14178 0, /* tp_print */
14179 0, /* tp_getattr */
14180 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014181 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014182 unicode_repr, /* tp_repr */
14183 &unicode_as_number, /* tp_as_number */
14184 &unicode_as_sequence, /* tp_as_sequence */
14185 &unicode_as_mapping, /* tp_as_mapping */
14186 (hashfunc) unicode_hash, /* tp_hash*/
14187 0, /* tp_call*/
14188 (reprfunc) unicode_str, /* tp_str */
14189 PyObject_GenericGetAttr, /* tp_getattro */
14190 0, /* tp_setattro */
14191 0, /* tp_as_buffer */
14192 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014193 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 unicode_doc, /* tp_doc */
14195 0, /* tp_traverse */
14196 0, /* tp_clear */
14197 PyUnicode_RichCompare, /* tp_richcompare */
14198 0, /* tp_weaklistoffset */
14199 unicode_iter, /* tp_iter */
14200 0, /* tp_iternext */
14201 unicode_methods, /* tp_methods */
14202 0, /* tp_members */
14203 0, /* tp_getset */
14204 &PyBaseObject_Type, /* tp_base */
14205 0, /* tp_dict */
14206 0, /* tp_descr_get */
14207 0, /* tp_descr_set */
14208 0, /* tp_dictoffset */
14209 0, /* tp_init */
14210 0, /* tp_alloc */
14211 unicode_new, /* tp_new */
14212 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014213};
14214
14215/* Initialize the Unicode implementation */
14216
Victor Stinner3a50e702011-10-18 21:21:00 +020014217int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014218{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014219 int i;
14220
Thomas Wouters477c8d52006-05-27 19:21:47 +000014221 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014222 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014223 0x000A, /* LINE FEED */
14224 0x000D, /* CARRIAGE RETURN */
14225 0x001C, /* FILE SEPARATOR */
14226 0x001D, /* GROUP SEPARATOR */
14227 0x001E, /* RECORD SEPARATOR */
14228 0x0085, /* NEXT LINE */
14229 0x2028, /* LINE SEPARATOR */
14230 0x2029, /* PARAGRAPH SEPARATOR */
14231 };
14232
Fred Drakee4315f52000-05-09 19:53:39 +000014233 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014234 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014235 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014236 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014237 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014238
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014239 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014240 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014241 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014242 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014243
14244 /* initialize the linebreak bloom filter */
14245 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014246 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014247 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014248
14249 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014250
14251#ifdef HAVE_MBCS
14252 winver.dwOSVersionInfoSize = sizeof(winver);
14253 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14254 PyErr_SetFromWindowsErr(0);
14255 return -1;
14256 }
14257#endif
14258 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259}
14260
14261/* Finalize the Unicode implementation */
14262
/* Release cached free-list entries.  This implementation has nothing
   to release and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14268
Guido van Rossumd57fd912000-03-10 22:53:23 +000014269void
Thomas Wouters78890102000-07-22 19:25:51 +000014270_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014271{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014272 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014273
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014274 Py_XDECREF(unicode_empty);
14275 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014276
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014277 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014278 if (unicode_latin1[i]) {
14279 Py_DECREF(unicode_latin1[i]);
14280 unicode_latin1[i] = NULL;
14281 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014282 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014283 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014284 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014285}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014286
Walter Dörwald16807132007-05-25 13:52:07 +000014287void
14288PyUnicode_InternInPlace(PyObject **p)
14289{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014290 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014291 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014292#ifdef Py_DEBUG
14293 assert(s != NULL);
14294 assert(_PyUnicode_CHECK(s));
14295#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014296 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014297 return;
14298#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014299 /* If it's a subclass, we don't really know what putting
14300 it in the interned dict might do. */
14301 if (!PyUnicode_CheckExact(s))
14302 return;
14303 if (PyUnicode_CHECK_INTERNED(s))
14304 return;
14305 if (interned == NULL) {
14306 interned = PyDict_New();
14307 if (interned == NULL) {
14308 PyErr_Clear(); /* Don't leave an exception */
14309 return;
14310 }
14311 }
14312 /* It might be that the GetItem call fails even
14313 though the key is present in the dictionary,
14314 namely when this happens during a stack overflow. */
14315 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014316 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014317 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014318
Benjamin Peterson29060642009-01-31 22:14:21 +000014319 if (t) {
14320 Py_INCREF(t);
14321 Py_DECREF(*p);
14322 *p = t;
14323 return;
14324 }
Walter Dörwald16807132007-05-25 13:52:07 +000014325
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014327 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014328 PyErr_Clear();
14329 PyThreadState_GET()->recursion_critical = 0;
14330 return;
14331 }
14332 PyThreadState_GET()->recursion_critical = 0;
14333 /* The two references in interned are not counted by refcnt.
14334 The deallocator will take care of this */
14335 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014336 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014337}
14338
14339void
14340PyUnicode_InternImmortal(PyObject **p)
14341{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014342 PyUnicode_InternInPlace(p);
14343 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014344 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 Py_INCREF(*p);
14346 }
Walter Dörwald16807132007-05-25 13:52:07 +000014347}
14348
14349PyObject *
14350PyUnicode_InternFromString(const char *cp)
14351{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014352 PyObject *s = PyUnicode_FromString(cp);
14353 if (s == NULL)
14354 return NULL;
14355 PyUnicode_InternInPlace(&s);
14356 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014357}
14358
Alexander Belopolsky40018472011-02-26 01:02:56 +000014359void
14360_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014361{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014363 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 Py_ssize_t i, n;
14365 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014366
Benjamin Peterson14339b62009-01-31 16:36:08 +000014367 if (interned == NULL || !PyDict_Check(interned))
14368 return;
14369 keys = PyDict_Keys(interned);
14370 if (keys == NULL || !PyList_Check(keys)) {
14371 PyErr_Clear();
14372 return;
14373 }
Walter Dörwald16807132007-05-25 13:52:07 +000014374
Benjamin Peterson14339b62009-01-31 16:36:08 +000014375 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14376 detector, interned unicode strings are not forcibly deallocated;
14377 rather, we give them their stolen references back, and then clear
14378 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014379
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 n = PyList_GET_SIZE(keys);
14381 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014382 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014383 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014384 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014385 if (PyUnicode_READY(s) == -1) {
14386 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014387 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014389 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014390 case SSTATE_NOT_INTERNED:
14391 /* XXX Shouldn't happen */
14392 break;
14393 case SSTATE_INTERNED_IMMORTAL:
14394 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014395 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014396 break;
14397 case SSTATE_INTERNED_MORTAL:
14398 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014399 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014400 break;
14401 default:
14402 Py_FatalError("Inconsistent interned string state.");
14403 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014404 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014405 }
14406 fprintf(stderr, "total size of all interned strings: "
14407 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14408 "mortal/immortal\n", mortal_size, immortal_size);
14409 Py_DECREF(keys);
14410 PyDict_Clear(interned);
14411 Py_DECREF(interned);
14412 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014413}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014414
14415
14416/********************* Unicode Iterator **************************/
14417
14418typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014419 PyObject_HEAD
14420 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014421 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014422} unicodeiterobject;
14423
14424static void
14425unicodeiter_dealloc(unicodeiterobject *it)
14426{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014427 _PyObject_GC_UNTRACK(it);
14428 Py_XDECREF(it->it_seq);
14429 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014430}
14431
14432static int
14433unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14434{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014435 Py_VISIT(it->it_seq);
14436 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014437}
14438
14439static PyObject *
14440unicodeiter_next(unicodeiterobject *it)
14441{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014442 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014443
Benjamin Peterson14339b62009-01-31 16:36:08 +000014444 assert(it != NULL);
14445 seq = it->it_seq;
14446 if (seq == NULL)
14447 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014448 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014450 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14451 int kind = PyUnicode_KIND(seq);
14452 void *data = PyUnicode_DATA(seq);
14453 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14454 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014455 if (item != NULL)
14456 ++it->it_index;
14457 return item;
14458 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014459
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 Py_DECREF(seq);
14461 it->it_seq = NULL;
14462 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014463}
14464
14465static PyObject *
14466unicodeiter_len(unicodeiterobject *it)
14467{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014468 Py_ssize_t len = 0;
14469 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014470 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014471 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014472}
14473
14474PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14475
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014476static PyObject *
14477unicodeiter_reduce(unicodeiterobject *it)
14478{
14479 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014480 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014481 it->it_seq, it->it_index);
14482 } else {
14483 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14484 if (u == NULL)
14485 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014486 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014487 }
14488}
14489
14490PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14491
14492static PyObject *
14493unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14494{
14495 Py_ssize_t index = PyLong_AsSsize_t(state);
14496 if (index == -1 && PyErr_Occurred())
14497 return NULL;
14498 if (index < 0)
14499 index = 0;
14500 it->it_index = index;
14501 Py_RETURN_NONE;
14502}
14503
14504PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14505
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014506static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014507 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014508 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014509 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14510 reduce_doc},
14511 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14512 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014513 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014514};
14515
14516PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014517 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14518 "str_iterator", /* tp_name */
14519 sizeof(unicodeiterobject), /* tp_basicsize */
14520 0, /* tp_itemsize */
14521 /* methods */
14522 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14523 0, /* tp_print */
14524 0, /* tp_getattr */
14525 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014526 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014527 0, /* tp_repr */
14528 0, /* tp_as_number */
14529 0, /* tp_as_sequence */
14530 0, /* tp_as_mapping */
14531 0, /* tp_hash */
14532 0, /* tp_call */
14533 0, /* tp_str */
14534 PyObject_GenericGetAttr, /* tp_getattro */
14535 0, /* tp_setattro */
14536 0, /* tp_as_buffer */
14537 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14538 0, /* tp_doc */
14539 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14540 0, /* tp_clear */
14541 0, /* tp_richcompare */
14542 0, /* tp_weaklistoffset */
14543 PyObject_SelfIter, /* tp_iter */
14544 (iternextfunc)unicodeiter_next, /* tp_iternext */
14545 unicodeiter_methods, /* tp_methods */
14546 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014547};
14548
14549static PyObject *
14550unicode_iter(PyObject *seq)
14551{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014552 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014553
Benjamin Peterson14339b62009-01-31 16:36:08 +000014554 if (!PyUnicode_Check(seq)) {
14555 PyErr_BadInternalCall();
14556 return NULL;
14557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014558 if (PyUnicode_READY(seq) == -1)
14559 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014560 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14561 if (it == NULL)
14562 return NULL;
14563 it->it_index = 0;
14564 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014565 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014566 _PyObject_GC_TRACK(it);
14567 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014568}
14569
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014570
/* Return the number of code units in the NUL-terminated Py_UNICODE
   string `u`, not counting the terminator.

   The counter has the same type as the return value; counting in an
   `int` would overflow (undefined behaviour) for strings longer than
   INT_MAX code units. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
14579
/* Copy the NUL-terminated string `s2`, terminator included, into `s1`.
   The destination must be large enough.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
14587
/* Copy at most `n` code units from the NUL-terminated string `s2` into
   `s1`.  Copying stops right after a NUL has been copied.  Returns
   `s1`.

   As with strncpy(), the destination is NOT NUL-terminated when `s2`
   holds `n` or more characters.  Unlike strncpy(), the remainder of the
   destination is not zero-padded when `s2` is shorter than `n`.

   The limit is checked before each store, so no more than `n` code
   units are ever written (the previous loop stored one unit before
   testing the counter and could write n + 1 units). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    while (n > 0) {
        n--;
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14597
/* Append the NUL-terminated string `s2` (terminator included) to the
   end of the NUL-terminated string `s1`.  The destination must be
   large enough.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Find the terminator of s1 ... */
    while (*tail != 0)
        tail++;
    /* ... and copy s2 over it, up to and including s2's terminator. */
    do {
        *tail = *s2++;
    } while (*tail++ != 0);
    return s1;
}
14606
/* Compare two NUL-terminated Py_UNICODE strings code unit by code
   unit.  Returns -1, 0 or +1 when `s1` sorts before, equal to, or after
   `s2`; a string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0)
        return (*s2 != 0) ? -1 : 0;     /* s1 ended first, or both ended */
    if (*s2 == 0)
        return 1;                       /* s2 ended first */
    return (*s1 < *s2) ? -1 : +1;       /* first differing code unit */
}
14620
/* Compare at most `n` code units of two NUL-terminated Py_UNICODE
   strings.  Returns -1 or +1 at the first differing code unit, and 0 if
   the strings are equal up to `n` units or up to a shared terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings ended here */
    }
    return 0;
}
14637
/* Return a pointer to the first occurrence of `c` in the NUL-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14647
/* Return a pointer to the last occurrence of `c` in the NUL-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is
   never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass, remembering the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014660
Victor Stinner71133ff2010-09-01 23:43:53 +000014661Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014662PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014663{
Victor Stinner577db2c2011-10-11 22:12:48 +020014664 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014665 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014667 if (!PyUnicode_Check(unicode)) {
14668 PyErr_BadArgument();
14669 return NULL;
14670 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014671 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014672 if (u == NULL)
14673 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014674 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014675 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014676 PyErr_NoMemory();
14677 return NULL;
14678 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014679 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014680 size *= sizeof(Py_UNICODE);
14681 copy = PyMem_Malloc(size);
14682 if (copy == NULL) {
14683 PyErr_NoMemory();
14684 return NULL;
14685 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014686 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014687 return copy;
14688}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014689
Georg Brandl66c221e2010-10-14 07:04:07 +000014690/* A _string module, to export formatter_parser and formatter_field_name_split
14691 to the string.Formatter class implemented in Python. */
14692
14693static PyMethodDef _string_methods[] = {
14694 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14695 METH_O, PyDoc_STR("split the argument as a field name")},
14696 {"formatter_parser", (PyCFunction) formatter_parser,
14697 METH_O, PyDoc_STR("parse the argument as a format string")},
14698 {NULL, NULL}
14699};
14700
14701static struct PyModuleDef _string_module = {
14702 PyModuleDef_HEAD_INIT,
14703 "_string",
14704 PyDoc_STR("string helper module"),
14705 0,
14706 _string_methods,
14707 NULL,
14708 NULL,
14709 NULL,
14710 NULL
14711};
14712
14713PyMODINIT_FUNC
14714PyInit__string(void)
14715{
14716 return PyModule_Create(&_string_module);
14717}
14718
14719
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014720#ifdef __cplusplus
14721}
14722#endif