blob: 84bbf9afff91667fa944e312cdf783e1ac334a30 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Byte-order switch: BYTEORDER_IS_BIG_ENDIAN is defined when the build
   defines WORDS_BIGENDIAN; otherwise little endian is assumed and
   BYTEORDER_IS_LITTLE_ENDIAN is defined. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
/* Largest code point of Unicode 6.0: U+10FFFF (1,114,111 in decimal). */
#define MAX_UNICODE 0x10FFFF
72
/* Object check used by the assertions in this file: a plain type check in
   regular builds, a structural consistency check (without content
   inspection) in Py_DEBUG builds. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors for the string object layouts.  The underscore-prefixed
   macros cast and dereference directly (they expand to lvalues unless
   noted); the PyUnicode_* variants first assert that the object is a
   ready Unicode string. */

/* utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 data of a ready string: for a compact ASCII string the memory
   right after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* utf8_length field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* UTF-8 length of a ready string: for a compact ASCII string the length
   field of the PyASCIIObject header, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* wstr field (PyASCIIObject) and wstr_length field
   (PyCompactUnicodeObject). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* state.kind and length, each preceded by an object check. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Cheap replacement for Py_MAX() when combining two maximum characters:
   use it when you are computing the second argument of PyUnicode_New().
   The bitwise OR is never smaller than either operand, and for bounds such
   as 0x7f, 0xff, 0xffff and 0x10ffff it is exactly the larger of the two. */
#define MAX_MAXCHAR(maxchar1, maxchar2) ((maxchar1) | (maxchar2))
118
/* File-local replacement for PyUnicode_READY(): asserts that op is a
   Unicode object, then evaluates to 0 if it is already ready and to the
   result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* True if the utf8 pointer of op is the character data itself, i.e. no
   separate UTF-8 buffer is used.  Asserts that op is not a compact ASCII
   string. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character data itself.
   Every reference goes through the macro argument, so the macro does not
   depend on how the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
133
/* True if the Unicode object owns a UTF-8 buffer of its own: it is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr buffer of its own: its wstr
   pointer is set and either the string is not ready yet or the pointer is
   not the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
149
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the half-open range [begin, end) is converted.  to points to the buffer
   receiving the result and is converted to "to_type *" as a whole, so it
   may be an arbitrary pointer expression (for example "buf + offset").

   Each source character is converted with a plain cast to to_type.  The
   main loop handles four characters per iteration; the trailing loop
   copies the remaining zero to three characters.  Every argument is
   evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
Walter Dörwald16807132007-05-25 13:52:07 +0000174/* This dictionary holds all interned unicode strings. Note that references
175 to strings in this dictionary are *not* counted in the string's ob_refcnt.
176 When the interned string reaches a refcnt of 0 the string deallocation
177 function will delete the reference from this dictionary.
178
179 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000180 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000181*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by ASCII code, holding 1 for whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Same idea for line breaks: a 128-entry table indexed by ASCII code,
   holding 1 for line-break characters and 0 otherwise. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build consistency check of a Unicode object.

   Asserts that the state bits (kind, compact, ascii, ready), the data
   pointer and the utf8 / wstr pointers and lengths of op agree with each
   other.  If check_content is non-zero and the string is not in the
   wchar_t kind, the characters are scanned as well to verify that the
   smallest suitable kind is used and that the data ends with a null
   character.

   Violations trigger assert(); the function itself always returns 1 so
   that calls can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int native_wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int native_wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            data = legacy->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact, ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(data != NULL);
                if (base->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == base->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* non-compact string that is not ready yet */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to alias the data exactly when the kind
               matches the native wchar_t width */
            if (kind == native_wchar_kind) {
                assert(base->wstr == data);
                assert(compact->wstr_length == base->length);
            }
            else {
                assert(base->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (base->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_UCS4 maxchar = 0;
        Py_UCS4 ch;
        void *data = PyUnicode_DATA(base);
        Py_ssize_t idx;

        for (idx = 0; idx < base->length; idx++) {
            ch = PyUnicode_READ(kind, data, idx);
            if (maxchar < ch)
                maxchar = ch;
        }

        if (kind == PyUnicode_1BYTE_KIND) {
            if (base->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character data must be null-terminated */
        assert(PyUnicode_READ(kind, data, base->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100415 len = _PyUnicode_WSTR_LENGTH(unicode);
416 if (len == 0) {
417 Py_INCREF(unicode_empty);
418 Py_DECREF(unicode);
419 return unicode_empty;
420 }
421
422 if (len == 1) {
423 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
424 if (ch < 256) {
425 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
426 Py_DECREF(unicode);
427 return latin1_char;
428 }
429 }
430
431 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200432 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 return NULL;
434 }
435#else
Victor Stinneraa771272012-10-04 02:32:58 +0200436 assert(Py_REFCNT(unicode) == 1);
437
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
#if defined(HAVE_MBCS)
/* OSVERSIONINFOEX record, only present when HAVE_MBCS is defined.
   NOTE(review): presumably filled in during module initialization and
   consulted by the MBCS codec -- confirm against the rest of the file. */
static OSVERSIONINFOEX winver;
#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
513/* stuff to implement simple "bloom filters" for Unicode characters.
514 to keep things simple, we use a single bitmask, using the least 5
515 bits from each unicode characters as the bit index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
Antoine Pitrouf068f942010-01-13 14:19:12 +0000519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
/* Integer type holding a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK() for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by ch in the lvalue mask.
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask;
   a zero result means ch was never added, a non-zero result only means it
   may have been.  `mask` is parenthesized so that any expression (for
   example "a | b") can be passed. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if ch is a line break: ASCII characters are looked up in the
   ascii_linebreak table, all others are pre-filtered with the
   bloom_linebreak mask before Py_UNICODE_ISLINEBREAK() is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
/* True if chr occurs in the string object str.  The bloom mask (expected
   to have been built from str) is tested first so that PyUnicode_FindChar()
   only runs when the character may be present. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
#ifdef Py_DEBUG
/* Fill the data of an Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters added by a resize are touched: the bytes of the
   characters from old_length up to the current length are set to 0xff (the
   kind value is used as the per-character byte width).  Nothing happens if
   the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200670#ifdef Py_DEBUG
671 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
672#endif
673
Victor Stinner79891572012-05-03 13:43:07 +0200674 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200675 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100676 assert(PyUnicode_IS_COMPACT(unicode));
677
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200678 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100679 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 struct_size = sizeof(PyASCIIObject);
681 else
682 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200683 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
686 PyErr_NoMemory();
687 return NULL;
688 }
689 new_size = (struct_size + (length + 1) * char_size);
690
Victor Stinner84def372011-12-11 20:04:56 +0100691 _Py_DEC_REFTOTAL;
692 _Py_ForgetReference(unicode);
693
694 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
695 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100696 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 PyErr_NoMemory();
698 return NULL;
699 }
Victor Stinner84def372011-12-11 20:04:56 +0100700 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100702
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200704 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100706 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 _PyUnicode_WSTR_LENGTH(unicode) = length;
708 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200709#ifdef Py_DEBUG
710 unicode_fill_invalid(unicode, old_length);
711#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
713 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200714 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200715 return unicode;
716}
717
Alexander Belopolsky40018472011-02-26 01:02:56 +0000718static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200719resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720{
Victor Stinner95663112011-10-04 01:03:50 +0200721 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100722 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000725
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 if (PyUnicode_IS_READY(unicode)) {
727 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200728 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200729 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200730#ifdef Py_DEBUG
731 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
732#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733
734 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200735 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200736 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
737 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200738
739 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
740 PyErr_NoMemory();
741 return -1;
742 }
743 new_size = (length + 1) * char_size;
744
Victor Stinner7a9105a2011-12-12 00:13:42 +0100745 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
746 {
747 PyObject_DEL(_PyUnicode_UTF8(unicode));
748 _PyUnicode_UTF8(unicode) = NULL;
749 _PyUnicode_UTF8_LENGTH(unicode) = 0;
750 }
751
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 data = (PyObject *)PyObject_REALLOC(data, new_size);
753 if (data == NULL) {
754 PyErr_NoMemory();
755 return -1;
756 }
757 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200759 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200760 _PyUnicode_WSTR_LENGTH(unicode) = length;
761 }
762 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200763 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200764 _PyUnicode_UTF8_LENGTH(unicode) = length;
765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 _PyUnicode_LENGTH(unicode) = length;
767 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 unicode_fill_invalid(unicode, old_length);
770#endif
Victor Stinner95663112011-10-04 01:03:50 +0200771 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200772 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
Victor Stinner95663112011-10-04 01:03:50 +0200776 assert(_PyUnicode_WSTR(unicode) != NULL);
777
778 /* check for integer overflow */
779 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
780 PyErr_NoMemory();
781 return -1;
782 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200784 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200786 if (!wstr) {
787 PyErr_NoMemory();
788 return -1;
789 }
790 _PyUnicode_WSTR(unicode) = wstr;
791 _PyUnicode_WSTR(unicode)[length] = 0;
792 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200793 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 return 0;
795}
796
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797static PyObject*
798resize_copy(PyObject *unicode, Py_ssize_t length)
799{
800 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100801 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200802 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100803
Benjamin Petersonbac79492012-01-14 13:34:47 -0500804 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100805 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806
807 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
808 if (copy == NULL)
809 return NULL;
810
811 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200812 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200814 }
815 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100817
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200818 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200819 if (w == NULL)
820 return NULL;
821 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
822 copy_length = Py_MIN(copy_length, length);
823 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
824 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200825 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200826 }
827}
828
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000830 Ux0000 terminated; some code (e.g. new_identifier)
831 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832
833 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000834 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
836*/
837
Alexander Belopolsky40018472011-02-26 01:02:56 +0000838static PyUnicodeObject *
839_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000840{
841 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200842 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000843
Thomas Wouters477c8d52006-05-27 19:21:47 +0000844 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 if (length == 0 && unicode_empty != NULL) {
846 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200847 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 }
849
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000850 /* Ensure we won't overflow the size. */
851 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
852 return (PyUnicodeObject *)PyErr_NoMemory();
853 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200854 if (length < 0) {
855 PyErr_SetString(PyExc_SystemError,
856 "Negative size passed to _PyUnicode_New");
857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000858 }
859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200860 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
861 if (unicode == NULL)
862 return NULL;
863 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
864 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
865 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100866 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000867 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100868 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200870
Jeremy Hyltond8082792003-09-16 19:41:39 +0000871 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000872 * the caller fails before initializing str -- unicode_resize()
873 * reads str[0], and the Keep-Alive optimization can keep memory
874 * allocated for str alive across a call to unicode_dealloc(unicode).
875 * We don't want unicode_resize to read uninitialized memory in
876 * that case.
877 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200878 _PyUnicode_WSTR(unicode)[0] = 0;
879 _PyUnicode_WSTR(unicode)[length] = 0;
880 _PyUnicode_WSTR_LENGTH(unicode) = length;
881 _PyUnicode_HASH(unicode) = -1;
882 _PyUnicode_STATE(unicode).interned = 0;
883 _PyUnicode_STATE(unicode).kind = 0;
884 _PyUnicode_STATE(unicode).compact = 0;
885 _PyUnicode_STATE(unicode).ready = 0;
886 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200887 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200888 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200889 _PyUnicode_UTF8(unicode) = NULL;
890 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100891 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000892 return unicode;
893}
894
Victor Stinnerf42dc442011-10-02 23:33:16 +0200895static const char*
896unicode_kind_name(PyObject *unicode)
897{
Victor Stinner42dfd712011-10-03 14:41:45 +0200898 /* don't check consistency: unicode_kind_name() is called from
899 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200900 if (!PyUnicode_IS_COMPACT(unicode))
901 {
902 if (!PyUnicode_IS_READY(unicode))
903 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600904 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200905 {
906 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200907 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 return "legacy ascii";
909 else
910 return "legacy latin1";
911 case PyUnicode_2BYTE_KIND:
912 return "legacy UCS2";
913 case PyUnicode_4BYTE_KIND:
914 return "legacy UCS4";
915 default:
916 return "<legacy invalid kind>";
917 }
918 }
919 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600920 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200922 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200923 return "ascii";
924 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200925 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200926 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200927 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200928 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200929 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200930 default:
931 return "<invalid compact kind>";
932 }
933}
934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200935#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
940
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
944void *_PyUnicode_data(void *unicode){
945 printf("obj %p\n", unicode);
946 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
947 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
948 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
949 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
950 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
951 return PyUnicode_DATA(unicode);
952}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200953
954void
955_PyUnicode_Dump(PyObject *op)
956{
957 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200958 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
959 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
960 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200961
Victor Stinnera849a4b2011-10-03 12:12:11 +0200962 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200963 {
964 if (ascii->state.ascii)
965 data = (ascii + 1);
966 else
967 data = (compact + 1);
968 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 else
970 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200971 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
972
Victor Stinnera849a4b2011-10-03 12:12:11 +0200973 if (ascii->wstr == data)
974 printf("shared ");
975 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200976
Victor Stinnera3b334d2011-10-03 13:53:37 +0200977 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200978 printf(" (%zu), ", compact->wstr_length);
979 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
980 printf("shared ");
981 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200982 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200983 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200984}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985#endif
986
987PyObject *
988PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
989{
990 PyObject *obj;
991 PyCompactUnicodeObject *unicode;
992 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200993 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200994 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995 Py_ssize_t char_size;
996 Py_ssize_t struct_size;
997
998 /* Optimization for empty strings */
999 if (size == 0 && unicode_empty != NULL) {
1000 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001001 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
1003
Victor Stinner9e9d6892011-10-04 01:02:02 +02001004 is_ascii = 0;
1005 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 struct_size = sizeof(PyCompactUnicodeObject);
1007 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001008 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001009 char_size = 1;
1010 is_ascii = 1;
1011 struct_size = sizeof(PyASCIIObject);
1012 }
1013 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001014 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001015 char_size = 1;
1016 }
1017 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001018 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001019 char_size = 2;
1020 if (sizeof(wchar_t) == 2)
1021 is_sharing = 1;
1022 }
1023 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001024 if (maxchar > MAX_UNICODE) {
1025 PyErr_SetString(PyExc_SystemError,
1026 "invalid maximum character passed to PyUnicode_New");
1027 return NULL;
1028 }
Victor Stinner8f825062012-04-27 13:55:39 +02001029 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030 char_size = 4;
1031 if (sizeof(wchar_t) == 4)
1032 is_sharing = 1;
1033 }
1034
1035 /* Ensure we won't overflow the size. */
1036 if (size < 0) {
1037 PyErr_SetString(PyExc_SystemError,
1038 "Negative size passed to PyUnicode_New");
1039 return NULL;
1040 }
1041 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1042 return PyErr_NoMemory();
1043
1044 /* Duplicated allocation code from _PyObject_New() instead of a call to
1045 * PyObject_New() so we are able to allocate space for the object and
1046 * it's data buffer.
1047 */
1048 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1049 if (obj == NULL)
1050 return PyErr_NoMemory();
1051 obj = PyObject_INIT(obj, &PyUnicode_Type);
1052 if (obj == NULL)
1053 return NULL;
1054
1055 unicode = (PyCompactUnicodeObject *)obj;
1056 if (is_ascii)
1057 data = ((PyASCIIObject*)obj) + 1;
1058 else
1059 data = unicode + 1;
1060 _PyUnicode_LENGTH(unicode) = size;
1061 _PyUnicode_HASH(unicode) = -1;
1062 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001063 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 _PyUnicode_STATE(unicode).compact = 1;
1065 _PyUnicode_STATE(unicode).ready = 1;
1066 _PyUnicode_STATE(unicode).ascii = is_ascii;
1067 if (is_ascii) {
1068 ((char*)data)[size] = 0;
1069 _PyUnicode_WSTR(unicode) = NULL;
1070 }
Victor Stinner8f825062012-04-27 13:55:39 +02001071 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072 ((char*)data)[size] = 0;
1073 _PyUnicode_WSTR(unicode) = NULL;
1074 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001076 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 else {
1079 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001080 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001081 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001083 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001084 ((Py_UCS4*)data)[size] = 0;
1085 if (is_sharing) {
1086 _PyUnicode_WSTR_LENGTH(unicode) = size;
1087 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1088 }
1089 else {
1090 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1091 _PyUnicode_WSTR(unicode) = NULL;
1092 }
1093 }
Victor Stinner8f825062012-04-27 13:55:39 +02001094#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001095 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001096#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001097 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 return obj;
1099}
1100
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4: a
   high surrogate immediately followed by a low surrogate is joined into a
   single code point, every other unit is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src + 1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* ordinary unit, or a surrogate without its partner */
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1140
Victor Stinnercd9950f2011-10-02 00:34:53 +02001141static int
Victor Stinner488fa492011-12-12 00:01:39 +01001142unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001143{
Victor Stinner488fa492011-12-12 00:01:39 +01001144 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001145 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001146 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001147 return -1;
1148 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001149 return 0;
1150}
1151
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152static int
1153_copy_characters(PyObject *to, Py_ssize_t to_start,
1154 PyObject *from, Py_ssize_t from_start,
1155 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001156{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001157 unsigned int from_kind, to_kind;
1158 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(0 <= how_many);
1161 assert(0 <= from_start);
1162 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001163 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001164 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001165 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001166
Victor Stinnerd3f08822012-05-29 12:57:52 +02001167 assert(PyUnicode_Check(to));
1168 assert(PyUnicode_IS_READY(to));
1169 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1170
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001171 if (how_many == 0)
1172 return 0;
1173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001175 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001177 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178
Victor Stinnerf1852262012-06-16 16:38:26 +02001179#ifdef Py_DEBUG
1180 if (!check_maxchar
1181 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1182 {
1183 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1184 Py_UCS4 ch;
1185 Py_ssize_t i;
1186 for (i=0; i < how_many; i++) {
1187 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1188 assert(ch <= to_maxchar);
1189 }
1190 }
1191#endif
1192
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001193 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 if (check_maxchar
1195 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1196 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001197 /* Writing Latin-1 characters into an ASCII string requires to
1198 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001199 Py_UCS4 max_char;
1200 max_char = ucs1lib_find_max_char(from_data,
1201 (Py_UCS1*)from_data + how_many);
1202 if (max_char >= 128)
1203 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001204 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001205 Py_MEMCPY((char*)to_data + to_kind * to_start,
1206 (char*)from_data + from_kind * from_start,
1207 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001209 else if (from_kind == PyUnicode_1BYTE_KIND
1210 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001211 {
1212 _PyUnicode_CONVERT_BYTES(
1213 Py_UCS1, Py_UCS2,
1214 PyUnicode_1BYTE_DATA(from) + from_start,
1215 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1216 PyUnicode_2BYTE_DATA(to) + to_start
1217 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001218 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001219 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001220 && to_kind == PyUnicode_4BYTE_KIND)
1221 {
1222 _PyUnicode_CONVERT_BYTES(
1223 Py_UCS1, Py_UCS4,
1224 PyUnicode_1BYTE_DATA(from) + from_start,
1225 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1226 PyUnicode_4BYTE_DATA(to) + to_start
1227 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001228 }
1229 else if (from_kind == PyUnicode_2BYTE_KIND
1230 && to_kind == PyUnicode_4BYTE_KIND)
1231 {
1232 _PyUnicode_CONVERT_BYTES(
1233 Py_UCS2, Py_UCS4,
1234 PyUnicode_2BYTE_DATA(from) + from_start,
1235 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1236 PyUnicode_4BYTE_DATA(to) + to_start
1237 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001238 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001239 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1241
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 if (!check_maxchar) {
1243 if (from_kind == PyUnicode_2BYTE_KIND
1244 && to_kind == PyUnicode_1BYTE_KIND)
1245 {
1246 _PyUnicode_CONVERT_BYTES(
1247 Py_UCS2, Py_UCS1,
1248 PyUnicode_2BYTE_DATA(from) + from_start,
1249 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1250 PyUnicode_1BYTE_DATA(to) + to_start
1251 );
1252 }
1253 else if (from_kind == PyUnicode_4BYTE_KIND
1254 && to_kind == PyUnicode_1BYTE_KIND)
1255 {
1256 _PyUnicode_CONVERT_BYTES(
1257 Py_UCS4, Py_UCS1,
1258 PyUnicode_4BYTE_DATA(from) + from_start,
1259 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1260 PyUnicode_1BYTE_DATA(to) + to_start
1261 );
1262 }
1263 else if (from_kind == PyUnicode_4BYTE_KIND
1264 && to_kind == PyUnicode_2BYTE_KIND)
1265 {
1266 _PyUnicode_CONVERT_BYTES(
1267 Py_UCS4, Py_UCS2,
1268 PyUnicode_4BYTE_DATA(from) + from_start,
1269 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1270 PyUnicode_2BYTE_DATA(to) + to_start
1271 );
1272 }
1273 else {
1274 assert(0);
1275 return -1;
1276 }
1277 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001278 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001279 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001280 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001281 Py_ssize_t i;
1282
Victor Stinnera0702ab2011-09-29 14:14:38 +02001283 for (i=0; i < how_many; i++) {
1284 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (ch > to_maxchar)
1286 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001287 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1288 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001289 }
1290 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001291 return 0;
1292}
1293
Victor Stinnerd3f08822012-05-29 12:57:52 +02001294void
1295_PyUnicode_FastCopyCharacters(
1296 PyObject *to, Py_ssize_t to_start,
1297 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001298{
1299 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1300}
1301
1302Py_ssize_t
1303PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1304 PyObject *from, Py_ssize_t from_start,
1305 Py_ssize_t how_many)
1306{
1307 int err;
1308
1309 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1310 PyErr_BadInternalCall();
1311 return -1;
1312 }
1313
Benjamin Petersonbac79492012-01-14 13:34:47 -05001314 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001315 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001316 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001317 return -1;
1318
Victor Stinnerd3f08822012-05-29 12:57:52 +02001319 if (from_start < 0) {
1320 PyErr_SetString(PyExc_IndexError, "string index out of range");
1321 return -1;
1322 }
1323 if (to_start < 0) {
1324 PyErr_SetString(PyExc_IndexError, "string index out of range");
1325 return -1;
1326 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001327 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1328 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1329 PyErr_Format(PyExc_SystemError,
1330 "Cannot write %zi characters at %zi "
1331 "in a string of %zi characters",
1332 how_many, to_start, PyUnicode_GET_LENGTH(to));
1333 return -1;
1334 }
1335
1336 if (how_many == 0)
1337 return 0;
1338
Victor Stinner488fa492011-12-12 00:01:39 +01001339 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001340 return -1;
1341
1342 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1343 if (err) {
1344 PyErr_Format(PyExc_SystemError,
1345 "Cannot copy %s characters "
1346 "into a string of %s characters",
1347 unicode_kind_name(from),
1348 unicode_kind_name(to));
1349 return -1;
1350 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001351 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001352}
1353
Victor Stinner17222162011-09-28 22:15:37 +02001354/* Find the maximum code point and count the number of surrogate pairs so a
1355 correct string length can be computed before converting a string to UCS4.
1356 This function counts single surrogates as a character and not as a pair.
1357
1358 Return 0 on success, or -1 on error. */
1359static int
1360find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1361 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362{
1363 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001364 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365
Victor Stinnerc53be962011-10-02 21:33:54 +02001366 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 *num_surrogates = 0;
1368 *maxchar = 0;
1369
1370 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001372 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1373 && (iter+1) < end
1374 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001376 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001378 iter += 2;
1379 }
1380 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001382 {
1383 ch = *iter;
1384 iter++;
1385 }
1386 if (ch > *maxchar) {
1387 *maxchar = ch;
1388 if (*maxchar > MAX_UNICODE) {
1389 PyErr_Format(PyExc_ValueError,
1390 "character U+%x is not in range [U+0000; U+10ffff]",
1391 ch);
1392 return -1;
1393 }
1394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 }
1396 return 0;
1397}
1398
/* Build the canonical representation of a legacy string that so far only
   owns a wchar_t (wstr) buffer.

   The wstr buffer is scanned with find_maxchar_surrogates() to obtain the
   largest code point and the number of surrogate pairs.  The kind is then
   chosen from that maximum: 1 byte per character below 256, 2 bytes below
   65536, 4 bytes otherwise.  Depending on SIZEOF_WCHAR_T the wstr buffer is
   either shared with the canonical data pointer or converted into a newly
   allocated buffer and then freed.

   Return 0 on success.  Return -1 if find_maxchar_surrogates() fails or,
   with a MemoryError set, if a buffer cannot be allocated. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One past the last wchar_t of the legacy buffer. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* Latin-1 range: narrow every wchar_t to a single byte; the extra
           byte holds the terminating null character. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the 1-byte data is also used as the UTF-8
               pointer. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer is no longer needed once converted. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode.  */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair occupies two wchar_t units but yields a single
           character in the 4-byte buffer. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide here: share the wstr buffer as
           the canonical data. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1523
Alexander Belopolsky40018472011-02-26 01:02:56 +00001524static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001525unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526{
Walter Dörwald16807132007-05-25 13:52:07 +00001527 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001528 case SSTATE_NOT_INTERNED:
1529 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001530
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 case SSTATE_INTERNED_MORTAL:
1532 /* revive dead object temporarily for DelItem */
1533 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001534 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001535 Py_FatalError(
1536 "deletion of interned string failed");
1537 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001538
Benjamin Peterson29060642009-01-31 22:14:21 +00001539 case SSTATE_INTERNED_IMMORTAL:
1540 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001541
Benjamin Peterson29060642009-01-31 22:14:21 +00001542 default:
1543 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001544 }
1545
Victor Stinner03490912011-10-03 23:45:12 +02001546 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001548 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001549 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001550 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1551 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001553 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554}
1555
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of exactly one character can be a cached
       Latin-1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1572
Alexander Belopolsky40018472011-02-26 01:02:56 +00001573static int
Victor Stinner488fa492011-12-12 00:01:39 +01001574unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001575{
Victor Stinner488fa492011-12-12 00:01:39 +01001576 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001577 if (Py_REFCNT(unicode) != 1)
1578 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001579 if (_PyUnicode_HASH(unicode) != -1)
1580 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001581 if (PyUnicode_CHECK_INTERNED(unicode))
1582 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001583 if (!PyUnicode_CheckExact(unicode))
1584 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001585#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001586 /* singleton refcount is greater than 1 */
1587 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001588#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001589 return 1;
1590}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592static int
1593unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1594{
1595 PyObject *unicode;
1596 Py_ssize_t old_length;
1597
1598 assert(p_unicode != NULL);
1599 unicode = *p_unicode;
1600
1601 assert(unicode != NULL);
1602 assert(PyUnicode_Check(unicode));
1603 assert(0 <= length);
1604
Victor Stinner910337b2011-10-03 03:20:16 +02001605 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001606 old_length = PyUnicode_WSTR_LENGTH(unicode);
1607 else
1608 old_length = PyUnicode_GET_LENGTH(unicode);
1609 if (old_length == length)
1610 return 0;
1611
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001612 if (length == 0) {
1613 Py_DECREF(*p_unicode);
1614 *p_unicode = unicode_empty;
1615 Py_INCREF(*p_unicode);
1616 return 0;
1617 }
1618
Victor Stinner488fa492011-12-12 00:01:39 +01001619 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 PyObject *copy = resize_copy(unicode, length);
1621 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001623 Py_DECREF(*p_unicode);
1624 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001625 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626 }
1627
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 PyObject *new_unicode = resize_compact(unicode, length);
1630 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001631 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001632 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001634 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001635 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636}
1637
Alexander Belopolsky40018472011-02-26 01:02:56 +00001638int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001640{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 PyObject *unicode;
1642 if (p_unicode == NULL) {
1643 PyErr_BadInternalCall();
1644 return -1;
1645 }
1646 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001647 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001648 {
1649 PyErr_BadInternalCall();
1650 return -1;
1651 }
1652 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001653}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001654
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001656unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1657 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001658{
1659 PyObject *result;
1660 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001661 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001662 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1663 return 0;
1664 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1665 maxchar);
1666 if (result == NULL)
1667 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001668 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001669 Py_DECREF(*p_unicode);
1670 *p_unicode = result;
1671 return 0;
1672}
1673
1674static int
1675unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1676 Py_UCS4 ch)
1677{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001678 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001679 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001680 return -1;
1681 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1682 PyUnicode_DATA(*p_unicode),
1683 (*pos)++, ch);
1684 return 0;
1685}
1686
Victor Stinnerc5166102012-02-22 13:55:02 +01001687/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001688
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001689 WARNING: The function doesn't copy the terminating null character and
1690 doesn't check the maximum character (may write a latin1 character in an
1691 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001692static void
1693unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1694 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001695{
1696 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1697 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001698 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001699
1700 switch (kind) {
1701 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001703#ifdef Py_DEBUG
1704 if (PyUnicode_IS_ASCII(unicode)) {
1705 Py_UCS4 maxchar = ucs1lib_find_max_char(
1706 (const Py_UCS1*)str,
1707 (const Py_UCS1*)str + len);
1708 assert(maxchar < 128);
1709 }
1710#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001711 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 case PyUnicode_2BYTE_KIND: {
1715 Py_UCS2 *start = (Py_UCS2 *)data + index;
1716 Py_UCS2 *ucs2 = start;
1717 assert(index <= PyUnicode_GET_LENGTH(unicode));
1718
Victor Stinner184252a2012-06-16 02:57:41 +02001719 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 *ucs2 = (Py_UCS2)*str;
1721
1722 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001723 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 default: {
1726 Py_UCS4 *start = (Py_UCS4 *)data + index;
1727 Py_UCS4 *ucs4 = start;
1728 assert(kind == PyUnicode_4BYTE_KIND);
1729 assert(index <= PyUnicode_GET_LENGTH(unicode));
1730
Victor Stinner184252a2012-06-16 02:57:41 +02001731 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001732 *ucs4 = (Py_UCS4)*str;
1733
1734 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 }
1736 }
1737}
1738
1739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740static PyObject*
1741get_latin1_char(unsigned char ch)
1742{
Victor Stinnera464fc12011-10-02 20:39:30 +02001743 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001745 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746 if (!unicode)
1747 return NULL;
1748 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001749 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750 unicode_latin1[ch] = unicode;
1751 }
1752 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001753 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Alexander Belopolsky40018472011-02-26 01:02:56 +00001756PyObject *
1757PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001758{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001759 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 Py_UCS4 maxchar = 0;
1761 Py_ssize_t num_surrogates;
1762
1763 if (u == NULL)
1764 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001766 /* If the Unicode data is known at construction time, we can apply
1767 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 /* Optimization for empty strings */
1770 if (size == 0 && unicode_empty != NULL) {
1771 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001772 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001773 }
Tim Petersced69f82003-09-16 20:30:58 +00001774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 /* Single character Unicode objects in the Latin-1 range are
1776 shared when using this constructor */
1777 if (size == 1 && *u < 256)
1778 return get_latin1_char((unsigned char)*u);
1779
1780 /* If not empty and not single character, copy the Unicode data
1781 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001782 if (find_maxchar_surrogates(u, u + size,
1783 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return NULL;
1785
Victor Stinner8faf8212011-12-08 22:14:11 +01001786 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001787 if (!unicode)
1788 return NULL;
1789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 switch (PyUnicode_KIND(unicode)) {
1791 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001792 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1794 break;
1795 case PyUnicode_2BYTE_KIND:
1796#if Py_UNICODE_SIZE == 2
1797 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1798#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001799 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1801#endif
1802 break;
1803 case PyUnicode_4BYTE_KIND:
1804#if SIZEOF_WCHAR_T == 2
1805 /* This is the only case which has to process surrogates, thus
1806 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001807 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001808#else
1809 assert(num_surrogates == 0);
1810 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1811#endif
1812 break;
1813 default:
1814 assert(0 && "Impossible state");
1815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001817 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818}
1819
Alexander Belopolsky40018472011-02-26 01:02:56 +00001820PyObject *
1821PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001822{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001823 if (size < 0) {
1824 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001825 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001826 return NULL;
1827 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001828 if (u != NULL)
1829 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1830 else
1831 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001832}
1833
Alexander Belopolsky40018472011-02-26 01:02:56 +00001834PyObject *
1835PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001836{
1837 size_t size = strlen(u);
1838 if (size > PY_SSIZE_T_MAX) {
1839 PyErr_SetString(PyExc_OverflowError, "input too long");
1840 return NULL;
1841 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001842 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001843}
1844
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001845PyObject *
1846_PyUnicode_FromId(_Py_Identifier *id)
1847{
1848 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001849 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1850 strlen(id->string),
1851 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001852 if (!id->object)
1853 return NULL;
1854 PyUnicode_InternInPlace(&id->object);
1855 assert(!id->next);
1856 id->next = static_strings;
1857 static_strings = id;
1858 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 return id->object;
1860}
1861
1862void
1863_PyUnicode_ClearStaticStrings()
1864{
1865 _Py_Identifier *i;
1866 for (i = static_strings; i; i = i->next) {
1867 Py_DECREF(i->object);
1868 i->object = NULL;
1869 i->next = NULL;
1870 }
1871}
1872
Benjamin Peterson0df54292012-03-26 14:50:32 -04001873/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001874
Victor Stinnerd3f08822012-05-29 12:57:52 +02001875PyObject*
1876_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001877{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001878 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001879 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001880 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001881#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001882 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001883#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001884 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001885 }
Victor Stinner785938e2011-12-11 20:09:03 +01001886 unicode = PyUnicode_New(size, 127);
1887 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001888 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001889 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1890 assert(_PyUnicode_CheckConsistency(unicode, 1));
1891 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001892}
1893
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001894static Py_UCS4
1895kind_maxchar_limit(unsigned int kind)
1896{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001897 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001898 case PyUnicode_1BYTE_KIND:
1899 return 0x80;
1900 case PyUnicode_2BYTE_KIND:
1901 return 0x100;
1902 case PyUnicode_4BYTE_KIND:
1903 return 0x10000;
1904 default:
1905 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001906 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001907 }
1908}
1909
Victor Stinnere6abb482012-05-02 01:15:40 +02001910Py_LOCAL_INLINE(Py_UCS4)
1911align_maxchar(Py_UCS4 maxchar)
1912{
1913 if (maxchar <= 127)
1914 return 127;
1915 else if (maxchar <= 255)
1916 return 255;
1917 else if (maxchar <= 65535)
1918 return 65535;
1919 else
1920 return MAX_UNICODE;
1921}
1922
Victor Stinner702c7342011-10-05 13:50:52 +02001923static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001924_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001925{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001927 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001928
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001929 if (size == 0) {
1930 Py_INCREF(unicode_empty);
1931 return unicode_empty;
1932 }
1933 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001934 if (size == 1)
1935 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001936
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001937 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 if (!res)
1940 return NULL;
1941 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001942 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001943 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001944}
1945
Victor Stinnere57b1c02011-09-28 22:20:48 +02001946static PyObject*
1947_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001948{
1949 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001950 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001951
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001952 if (size == 0) {
1953 Py_INCREF(unicode_empty);
1954 return unicode_empty;
1955 }
1956 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001957 if (size == 1) {
1958 Py_UCS4 ch = u[0];
1959 if (ch < 256)
1960 return get_latin1_char((unsigned char)ch);
1961
1962 res = PyUnicode_New(1, ch);
1963 if (res == NULL)
1964 return NULL;
1965 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1966 assert(_PyUnicode_CheckConsistency(res, 1));
1967 return res;
1968 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001970 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972 if (!res)
1973 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001974 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001975 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001976 else {
1977 _PyUnicode_CONVERT_BYTES(
1978 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001980 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 return res;
1982}
1983
Victor Stinnere57b1c02011-09-28 22:20:48 +02001984static PyObject*
1985_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001986{
1987 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001988 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 if (size == 0) {
1991 Py_INCREF(unicode_empty);
1992 return unicode_empty;
1993 }
1994 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001995 if (size == 1) {
1996 Py_UCS4 ch = u[0];
1997 if (ch < 256)
1998 return get_latin1_char((unsigned char)ch);
1999
2000 res = PyUnicode_New(1, ch);
2001 if (res == NULL)
2002 return NULL;
2003 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
2004 assert(_PyUnicode_CheckConsistency(res, 1));
2005 return res;
2006 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002007
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002008 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002009 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 if (!res)
2011 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002012 if (max_char < 256)
2013 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2014 PyUnicode_1BYTE_DATA(res));
2015 else if (max_char < 0x10000)
2016 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2017 PyUnicode_2BYTE_DATA(res));
2018 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002020 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 return res;
2022}
2023
2024PyObject*
2025PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2026{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002027 if (size < 0) {
2028 PyErr_SetString(PyExc_ValueError, "size must be positive");
2029 return NULL;
2030 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002031 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002033 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002035 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002037 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002038 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002039 PyErr_SetString(PyExc_SystemError, "invalid kind");
2040 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042}
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044Py_UCS4
2045_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2046{
2047 enum PyUnicode_Kind kind;
2048 void *startptr, *endptr;
2049
2050 assert(PyUnicode_IS_READY(unicode));
2051 assert(0 <= start);
2052 assert(end <= PyUnicode_GET_LENGTH(unicode));
2053 assert(start <= end);
2054
2055 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2056 return PyUnicode_MAX_CHAR_VALUE(unicode);
2057
2058 if (start == end)
2059 return 127;
2060
Victor Stinner94d558b2012-04-27 22:26:58 +02002061 if (PyUnicode_IS_ASCII(unicode))
2062 return 127;
2063
Victor Stinnerece58de2012-04-23 23:36:38 +02002064 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002065 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002066 endptr = (char *)startptr + end * kind;
2067 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002068 switch(kind) {
2069 case PyUnicode_1BYTE_KIND:
2070 return ucs1lib_find_max_char(startptr, endptr);
2071 case PyUnicode_2BYTE_KIND:
2072 return ucs2lib_find_max_char(startptr, endptr);
2073 case PyUnicode_4BYTE_KIND:
2074 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002075 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002076 assert(0);
2077 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002078 }
2079}
2080
Victor Stinner25a4b292011-10-06 12:31:55 +02002081/* Ensure that a string uses the most efficient storage, if it is not the
2082 case: create a new string with of the right kind. Write NULL into *p_unicode
2083 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002084static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002085unicode_adjust_maxchar(PyObject **p_unicode)
2086{
2087 PyObject *unicode, *copy;
2088 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002089 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002090 unsigned int kind;
2091
2092 assert(p_unicode != NULL);
2093 unicode = *p_unicode;
2094 assert(PyUnicode_IS_READY(unicode));
2095 if (PyUnicode_IS_ASCII(unicode))
2096 return;
2097
2098 len = PyUnicode_GET_LENGTH(unicode);
2099 kind = PyUnicode_KIND(unicode);
2100 if (kind == PyUnicode_1BYTE_KIND) {
2101 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002102 max_char = ucs1lib_find_max_char(u, u + len);
2103 if (max_char >= 128)
2104 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002105 }
2106 else if (kind == PyUnicode_2BYTE_KIND) {
2107 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002108 max_char = ucs2lib_find_max_char(u, u + len);
2109 if (max_char >= 256)
2110 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 }
2112 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002113 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002114 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002115 max_char = ucs4lib_find_max_char(u, u + len);
2116 if (max_char >= 0x10000)
2117 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002118 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002119 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002120 if (copy != NULL)
2121 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002122 Py_DECREF(unicode);
2123 *p_unicode = copy;
2124}
2125
Victor Stinner034f6cf2011-09-30 02:26:44 +02002126PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002127_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002128{
Victor Stinner87af4f22011-11-21 23:03:47 +01002129 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002130 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002131
Victor Stinner034f6cf2011-09-30 02:26:44 +02002132 if (!PyUnicode_Check(unicode)) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002136 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002137 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002138
Victor Stinner87af4f22011-11-21 23:03:47 +01002139 length = PyUnicode_GET_LENGTH(unicode);
2140 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002141 if (!copy)
2142 return NULL;
2143 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2144
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2146 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002147 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149}
2150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152/* Widen Unicode objects to larger buffers. Don't write terminating null
2153 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002154
2155void*
2156_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2157{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 Py_ssize_t len;
2159 void *result;
2160 unsigned int skind;
2161
Benjamin Petersonbac79492012-01-14 13:34:47 -05002162 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 return NULL;
2164
2165 len = PyUnicode_GET_LENGTH(s);
2166 skind = PyUnicode_KIND(s);
2167 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002168 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 return NULL;
2170 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002171 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002172 case PyUnicode_2BYTE_KIND:
2173 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2174 if (!result)
2175 return PyErr_NoMemory();
2176 assert(skind == PyUnicode_1BYTE_KIND);
2177 _PyUnicode_CONVERT_BYTES(
2178 Py_UCS1, Py_UCS2,
2179 PyUnicode_1BYTE_DATA(s),
2180 PyUnicode_1BYTE_DATA(s) + len,
2181 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 case PyUnicode_4BYTE_KIND:
2184 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2185 if (!result)
2186 return PyErr_NoMemory();
2187 if (skind == PyUnicode_2BYTE_KIND) {
2188 _PyUnicode_CONVERT_BYTES(
2189 Py_UCS2, Py_UCS4,
2190 PyUnicode_2BYTE_DATA(s),
2191 PyUnicode_2BYTE_DATA(s) + len,
2192 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002194 else {
2195 assert(skind == PyUnicode_1BYTE_KIND);
2196 _PyUnicode_CONVERT_BYTES(
2197 Py_UCS1, Py_UCS4,
2198 PyUnicode_1BYTE_DATA(s),
2199 PyUnicode_1BYTE_DATA(s) + len,
2200 result);
2201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 default:
2204 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 }
Victor Stinner01698042011-10-04 00:04:26 +02002206 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002207 return NULL;
2208}
2209
2210static Py_UCS4*
2211as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2212 int copy_null)
2213{
2214 int kind;
2215 void *data;
2216 Py_ssize_t len, targetlen;
2217 if (PyUnicode_READY(string) == -1)
2218 return NULL;
2219 kind = PyUnicode_KIND(string);
2220 data = PyUnicode_DATA(string);
2221 len = PyUnicode_GET_LENGTH(string);
2222 targetlen = len;
2223 if (copy_null)
2224 targetlen++;
2225 if (!target) {
2226 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2227 PyErr_NoMemory();
2228 return NULL;
2229 }
2230 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2231 if (!target) {
2232 PyErr_NoMemory();
2233 return NULL;
2234 }
2235 }
2236 else {
2237 if (targetsize < targetlen) {
2238 PyErr_Format(PyExc_SystemError,
2239 "string is longer than the buffer");
2240 if (copy_null && 0 < targetsize)
2241 target[0] = 0;
2242 return NULL;
2243 }
2244 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002245 if (kind == PyUnicode_1BYTE_KIND) {
2246 Py_UCS1 *start = (Py_UCS1 *) data;
2247 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002249 else if (kind == PyUnicode_2BYTE_KIND) {
2250 Py_UCS2 *start = (Py_UCS2 *) data;
2251 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2252 }
2253 else {
2254 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002257 if (copy_null)
2258 target[len] = 0;
2259 return target;
2260}
2261
2262Py_UCS4*
2263PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2264 int copy_null)
2265{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002266 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002267 PyErr_BadInternalCall();
2268 return NULL;
2269 }
2270 return as_ucs4(string, target, targetsize, copy_null);
2271}
2272
2273Py_UCS4*
2274PyUnicode_AsUCS4Copy(PyObject *string)
2275{
2276 return as_ucs4(string, NULL, 0, 1);
2277}
2278
2279#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002280
Alexander Belopolsky40018472011-02-26 01:02:56 +00002281PyObject *
2282PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002284 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002285 if (size == 0) {
2286 Py_INCREF(unicode_empty);
2287 return unicode_empty;
2288 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002289 PyErr_BadInternalCall();
2290 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002291 }
2292
Martin v. Löwis790465f2008-04-05 20:41:37 +00002293 if (size == -1) {
2294 size = wcslen(w);
2295 }
2296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002297 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298}
2299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002300#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301
Walter Dörwald346737f2007-05-31 10:44:43 +00002302static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002303makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2304 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002305{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002306 *fmt++ = '%';
2307 if (width) {
2308 if (zeropad)
2309 *fmt++ = '0';
2310 fmt += sprintf(fmt, "%d", width);
2311 }
2312 if (precision)
2313 fmt += sprintf(fmt, ".%d", precision);
2314 if (longflag)
2315 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002316 else if (longlongflag) {
2317 /* longlongflag should only ever be nonzero on machines with
2318 HAVE_LONG_LONG defined */
2319#ifdef HAVE_LONG_LONG
2320 char *f = PY_FORMAT_LONG_LONG;
2321 while (*f)
2322 *fmt++ = *f++;
2323#else
2324 /* we shouldn't ever get here */
2325 assert(0);
2326 *fmt++ = 'l';
2327#endif
2328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002329 else if (size_tflag) {
2330 char *f = PY_FORMAT_SIZE_T;
2331 while (*f)
2332 *fmt++ = *f++;
2333 }
2334 *fmt++ = c;
2335 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002336}
2337
Victor Stinner96865452011-03-01 23:44:09 +00002338/* helper for PyUnicode_FromFormatV() */
2339
2340static const char*
2341parse_format_flags(const char *f,
2342 int *p_width, int *p_precision,
2343 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2344{
2345 int width, precision, longflag, longlongflag, size_tflag;
2346
2347 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2348 f++;
2349 width = 0;
2350 while (Py_ISDIGIT((unsigned)*f))
2351 width = (width*10) + *f++ - '0';
2352 precision = 0;
2353 if (*f == '.') {
2354 f++;
2355 while (Py_ISDIGIT((unsigned)*f))
2356 precision = (precision*10) + *f++ - '0';
2357 if (*f == '%') {
2358 /* "%.3%s" => f points to "3" */
2359 f--;
2360 }
2361 }
2362 if (*f == '\0') {
2363 /* bogus format "%.1" => go backward, f points to "1" */
2364 f--;
2365 }
2366 if (p_width != NULL)
2367 *p_width = width;
2368 if (p_precision != NULL)
2369 *p_precision = precision;
2370
2371 /* Handle %ld, %lu, %lld and %llu. */
2372 longflag = 0;
2373 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002374 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002375
2376 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002377 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002378 longflag = 1;
2379 ++f;
2380 }
2381#ifdef HAVE_LONG_LONG
2382 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002383 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002384 longlongflag = 1;
2385 f += 2;
2386 }
2387#endif
2388 }
2389 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002390 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002391 size_tflag = 1;
2392 ++f;
2393 }
2394 if (p_longflag != NULL)
2395 *p_longflag = longflag;
2396 if (p_longlongflag != NULL)
2397 *p_longlongflag = longlongflag;
2398 if (p_size_tflag != NULL)
2399 *p_size_tflag = size_tflag;
2400 return f;
2401}
2402
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002403/* maximum number of characters required for output of %ld. 21 characters
2404 allows for 64-bit integers (in decimal) and an optional sign. */
2405#define MAX_LONG_CHARS 21
2406/* maximum number of characters required for output of %lld.
2407 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2408 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2409#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2410
Walter Dörwaldd2034312007-05-18 16:29:38 +00002411PyObject *
2412PyUnicode_FromFormatV(const char *format, va_list vargs)
2413{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002414 va_list count;
2415 Py_ssize_t callcount = 0;
2416 PyObject **callresults = NULL;
2417 PyObject **callresult = NULL;
2418 Py_ssize_t n = 0;
2419 int width = 0;
2420 int precision = 0;
2421 int zeropad;
2422 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002423 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002424 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002425 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2427 Py_UCS4 argmaxchar;
2428 Py_ssize_t numbersize = 0;
2429 char *numberresults = NULL;
2430 char *numberresult = NULL;
2431 Py_ssize_t i;
2432 int kind;
2433 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002434
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002435 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002436 /* step 1: count the number of %S/%R/%A/%s format specifications
2437 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2438 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002440 * also estimate a upper bound for all the number formats in the string,
2441 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 for (f = format; *f; f++) {
2444 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002445 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2447 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2448 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2449 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002450
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002452#ifdef HAVE_LONG_LONG
2453 if (longlongflag) {
2454 if (width < MAX_LONG_LONG_CHARS)
2455 width = MAX_LONG_LONG_CHARS;
2456 }
2457 else
2458#endif
2459 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2460 including sign. Decimal takes the most space. This
2461 isn't enough for octal. If a width is specified we
2462 need more (which we allocate later). */
2463 if (width < MAX_LONG_CHARS)
2464 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465
2466 /* account for the size + '\0' to separate numbers
2467 inside of the numberresults buffer */
2468 numbersize += (width + 1);
2469 }
2470 }
2471 else if ((unsigned char)*f > 127) {
2472 PyErr_Format(PyExc_ValueError,
2473 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2474 "string, got a non-ASCII byte: 0x%02x",
2475 (unsigned char)*f);
2476 return NULL;
2477 }
2478 }
2479 /* step 2: allocate memory for the results of
2480 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2481 if (callcount) {
2482 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2483 if (!callresults) {
2484 PyErr_NoMemory();
2485 return NULL;
2486 }
2487 callresult = callresults;
2488 }
2489 /* step 2.5: allocate memory for the results of formating numbers */
2490 if (numbersize) {
2491 numberresults = PyObject_Malloc(numbersize);
2492 if (!numberresults) {
2493 PyErr_NoMemory();
2494 goto fail;
2495 }
2496 numberresult = numberresults;
2497 }
2498
2499 /* step 3: format numbers and figure out how large a buffer we need */
2500 for (f = format; *f; f++) {
2501 if (*f == '%') {
2502 const char* p;
2503 int longflag;
2504 int longlongflag;
2505 int size_tflag;
2506 int numprinted;
2507
2508 p = f;
2509 zeropad = (f[1] == '0');
2510 f = parse_format_flags(f, &width, &precision,
2511 &longflag, &longlongflag, &size_tflag);
2512 switch (*f) {
2513 case 'c':
2514 {
2515 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002516 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 n++;
2518 break;
2519 }
2520 case '%':
2521 n++;
2522 break;
2523 case 'i':
2524 case 'd':
2525 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2526 width, precision, *f);
2527 if (longflag)
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, long));
2530#ifdef HAVE_LONG_LONG
2531 else if (longlongflag)
2532 numprinted = sprintf(numberresult, fmt,
2533 va_arg(count, PY_LONG_LONG));
2534#endif
2535 else if (size_tflag)
2536 numprinted = sprintf(numberresult, fmt,
2537 va_arg(count, Py_ssize_t));
2538 else
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, int));
2541 n += numprinted;
2542 /* advance by +1 to skip over the '\0' */
2543 numberresult += (numprinted + 1);
2544 assert(*(numberresult - 1) == '\0');
2545 assert(*(numberresult - 2) != '\0');
2546 assert(numprinted >= 0);
2547 assert(numberresult <= numberresults + numbersize);
2548 break;
2549 case 'u':
2550 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2551 width, precision, 'u');
2552 if (longflag)
2553 numprinted = sprintf(numberresult, fmt,
2554 va_arg(count, unsigned long));
2555#ifdef HAVE_LONG_LONG
2556 else if (longlongflag)
2557 numprinted = sprintf(numberresult, fmt,
2558 va_arg(count, unsigned PY_LONG_LONG));
2559#endif
2560 else if (size_tflag)
2561 numprinted = sprintf(numberresult, fmt,
2562 va_arg(count, size_t));
2563 else
2564 numprinted = sprintf(numberresult, fmt,
2565 va_arg(count, unsigned int));
2566 n += numprinted;
2567 numberresult += (numprinted + 1);
2568 assert(*(numberresult - 1) == '\0');
2569 assert(*(numberresult - 2) != '\0');
2570 assert(numprinted >= 0);
2571 assert(numberresult <= numberresults + numbersize);
2572 break;
2573 case 'x':
2574 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2575 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2576 n += numprinted;
2577 numberresult += (numprinted + 1);
2578 assert(*(numberresult - 1) == '\0');
2579 assert(*(numberresult - 2) != '\0');
2580 assert(numprinted >= 0);
2581 assert(numberresult <= numberresults + numbersize);
2582 break;
2583 case 'p':
2584 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2585 /* %p is ill-defined: ensure leading 0x. */
2586 if (numberresult[1] == 'X')
2587 numberresult[1] = 'x';
2588 else if (numberresult[1] != 'x') {
2589 memmove(numberresult + 2, numberresult,
2590 strlen(numberresult) + 1);
2591 numberresult[0] = '0';
2592 numberresult[1] = 'x';
2593 numprinted += 2;
2594 }
2595 n += numprinted;
2596 numberresult += (numprinted + 1);
2597 assert(*(numberresult - 1) == '\0');
2598 assert(*(numberresult - 2) != '\0');
2599 assert(numprinted >= 0);
2600 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 break;
2602 case 's':
2603 {
2604 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002605 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002606 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002607 if (!str)
2608 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 /* since PyUnicode_DecodeUTF8 returns already flexible
2610 unicode objects, there is no need to call ready on them */
2611 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002612 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002613 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002614 /* Remember the str and switch to the next slot */
2615 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 break;
2617 }
2618 case 'U':
2619 {
2620 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002621 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 if (PyUnicode_READY(obj) == -1)
2623 goto fail;
2624 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002625 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002626 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 break;
2628 }
2629 case 'V':
2630 {
2631 PyObject *obj = va_arg(count, PyObject *);
2632 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002635 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002636 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 if (PyUnicode_READY(obj) == -1)
2638 goto fail;
2639 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002640 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002641 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002642 *callresult++ = NULL;
2643 }
2644 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002645 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002646 if (!str_obj)
2647 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002648 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002649 Py_DECREF(str_obj);
2650 goto fail;
2651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002652 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002653 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002655 *callresult++ = str_obj;
2656 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 break;
2658 }
2659 case 'S':
2660 {
2661 PyObject *obj = va_arg(count, PyObject *);
2662 PyObject *str;
2663 assert(obj);
2664 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002665 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002666 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (PyUnicode_READY(str) == -1) {
2668 Py_DECREF(str);
2669 goto fail;
2670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002672 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002674 /* Remember the str and switch to the next slot */
2675 *callresult++ = str;
2676 break;
2677 }
2678 case 'R':
2679 {
2680 PyObject *obj = va_arg(count, PyObject *);
2681 PyObject *repr;
2682 assert(obj);
2683 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002684 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002685 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002686 if (PyUnicode_READY(repr) == -1) {
2687 Py_DECREF(repr);
2688 goto fail;
2689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002691 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002692 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002693 /* Remember the repr and switch to the next slot */
2694 *callresult++ = repr;
2695 break;
2696 }
2697 case 'A':
2698 {
2699 PyObject *obj = va_arg(count, PyObject *);
2700 PyObject *ascii;
2701 assert(obj);
2702 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002703 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002704 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002705 if (PyUnicode_READY(ascii) == -1) {
2706 Py_DECREF(ascii);
2707 goto fail;
2708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002710 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 /* Remember the repr and switch to the next slot */
2713 *callresult++ = ascii;
2714 break;
2715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 default:
2717 /* if we stumble upon an unknown
2718 formatting code, copy the rest of
2719 the format string to the output
2720 string. (we cannot just skip the
2721 code, since there's no way to know
2722 what's in the argument list) */
2723 n += strlen(p);
2724 goto expand;
2725 }
2726 } else
2727 n++;
2728 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002730 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002732 we don't have to resize the string.
2733 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002734 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 if (!string)
2736 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 kind = PyUnicode_KIND(string);
2738 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002741
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002744 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002745
2746 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2748 /* checking for == because the last argument could be a empty
2749 string, which causes i to point to end, the assert at the end of
2750 the loop */
2751 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002752
Benjamin Peterson14339b62009-01-31 16:36:08 +00002753 switch (*f) {
2754 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002755 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 const int ordinal = va_arg(vargs, int);
2757 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002758 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002759 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002762 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002763 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002765 {
Victor Stinner184252a2012-06-16 02:57:41 +02002766 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002767 /* unused, since we already have the result */
2768 if (*f == 'p')
2769 (void) va_arg(vargs, void *);
2770 else
2771 (void) va_arg(vargs, int);
2772 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002773 len = strlen(numberresult);
2774 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002776 i += len;
2777 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 assert(*numberresult == '\0');
2779 numberresult++;
2780 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002781 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002783 case 's':
2784 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002785 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002787 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 size = PyUnicode_GET_LENGTH(*callresult);
2789 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002790 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002792 /* We're done with the unicode()/repr() => forget it */
2793 Py_DECREF(*callresult);
2794 /* switch to next unicode()/repr() result */
2795 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 break;
2797 }
2798 case 'U':
2799 {
2800 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 Py_ssize_t size;
2802 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2803 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002804 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002806 break;
2807 }
2808 case 'V':
2809 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002812 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002813 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 size = PyUnicode_GET_LENGTH(obj);
2815 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002816 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002818 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 size = PyUnicode_GET_LENGTH(*callresult);
2820 assert(PyUnicode_KIND(*callresult) <=
2821 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002822 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002823 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002824 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002825 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002826 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 break;
2828 }
2829 case 'S':
2830 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002831 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002832 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002833 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002834 /* unused, since we already have the result */
2835 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002837 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002838 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002839 /* We're done with the unicode()/repr() => forget it */
2840 Py_DECREF(*callresult);
2841 /* switch to next unicode()/repr() result */
2842 ++callresult;
2843 break;
2844 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002845 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002846 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002847 break;
2848 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002849 {
2850 Py_ssize_t len = strlen(p);
2851 unicode_write_cstr(string, i, p, len);
2852 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002853 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 goto end;
2855 }
Victor Stinner184252a2012-06-16 02:57:41 +02002856 }
Victor Stinner1205f272010-09-11 00:54:47 +00002857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 else {
2859 assert(i < PyUnicode_GET_LENGTH(string));
2860 PyUnicode_WRITE(kind, data, i++, *f);
2861 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002863 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002864
Benjamin Peterson29060642009-01-31 22:14:21 +00002865 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002866 if (callresults)
2867 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002868 if (numberresults)
2869 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002870 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002871 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002872 if (callresults) {
2873 PyObject **callresult2 = callresults;
2874 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002875 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002876 ++callresult2;
2877 }
2878 PyObject_Free(callresults);
2879 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002880 if (numberresults)
2881 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002882 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002883}
2884
Walter Dörwaldd2034312007-05-18 16:29:38 +00002885PyObject *
2886PyUnicode_FromFormat(const char *format, ...)
2887{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002888 PyObject* ret;
2889 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002890
2891#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002892 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002893#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002894 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002895#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 ret = PyUnicode_FromFormatV(format, vargs);
2897 va_end(vargs);
2898 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002899}
2900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002901#ifdef HAVE_WCHAR_H
2902
Victor Stinner5593d8a2010-10-02 11:11:27 +00002903/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2904 convert a Unicode object to a wide character string.
2905
Victor Stinnerd88d9832011-09-06 02:00:05 +02002906 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002907 character) required to convert the unicode object. Ignore size argument.
2908
Victor Stinnerd88d9832011-09-06 02:00:05 +02002909 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002910 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002911 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002912static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002913unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002914 wchar_t *w,
2915 Py_ssize_t size)
2916{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002917 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002918 const wchar_t *wstr;
2919
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002920 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002921 if (wstr == NULL)
2922 return -1;
2923
Victor Stinner5593d8a2010-10-02 11:11:27 +00002924 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002925 if (size > res)
2926 size = res + 1;
2927 else
2928 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002929 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002930 return res;
2931 }
2932 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002933 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002934}
2935
2936Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002937PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002938 wchar_t *w,
2939 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002940{
2941 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 PyErr_BadInternalCall();
2943 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002945 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002946}
2947
Victor Stinner137c34c2010-09-29 10:25:54 +00002948wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002949PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002950 Py_ssize_t *size)
2951{
2952 wchar_t* buffer;
2953 Py_ssize_t buflen;
2954
2955 if (unicode == NULL) {
2956 PyErr_BadInternalCall();
2957 return NULL;
2958 }
2959
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002960 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 if (buflen == -1)
2962 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002963 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002964 PyErr_NoMemory();
2965 return NULL;
2966 }
2967
Victor Stinner137c34c2010-09-29 10:25:54 +00002968 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2969 if (buffer == NULL) {
2970 PyErr_NoMemory();
2971 return NULL;
2972 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002973 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002974 if (buflen == -1) {
2975 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002977 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002978 if (size != NULL)
2979 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002980 return buffer;
2981}
2982
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002983#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984
Alexander Belopolsky40018472011-02-26 01:02:56 +00002985PyObject *
2986PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002988 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002989 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002990 PyErr_SetString(PyExc_ValueError,
2991 "chr() arg not in range(0x110000)");
2992 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002993 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002995 if (ordinal < 256)
2996 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002998 v = PyUnicode_New(1, ordinal);
2999 if (v == NULL)
3000 return NULL;
3001 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003002 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003003 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003004}
3005
Alexander Belopolsky40018472011-02-26 01:02:56 +00003006PyObject *
3007PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003008{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003009 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003011 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003012 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003013 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003014 Py_INCREF(obj);
3015 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003016 }
3017 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003018 /* For a Unicode subtype that's not a Unicode object,
3019 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003020 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003021 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003022 PyErr_Format(PyExc_TypeError,
3023 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003024 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003025 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003026}
3027
Alexander Belopolsky40018472011-02-26 01:02:56 +00003028PyObject *
3029PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003030 const char *encoding,
3031 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003032{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003033 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003034 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003035
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003037 PyErr_BadInternalCall();
3038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003039 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003040
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003041 /* Decoding bytes objects is the most common case and should be fast */
3042 if (PyBytes_Check(obj)) {
3043 if (PyBytes_GET_SIZE(obj) == 0) {
3044 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003045 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003046 }
3047 else {
3048 v = PyUnicode_Decode(
3049 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3050 encoding, errors);
3051 }
3052 return v;
3053 }
3054
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003055 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 PyErr_SetString(PyExc_TypeError,
3057 "decoding str is not supported");
3058 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003059 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003060
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003061 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3062 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3063 PyErr_Format(PyExc_TypeError,
3064 "coercing to str: need bytes, bytearray "
3065 "or buffer-like object, %.80s found",
3066 Py_TYPE(obj)->tp_name);
3067 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003068 }
Tim Petersced69f82003-09-16 20:30:58 +00003069
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003070 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003071 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003072 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073 }
Tim Petersced69f82003-09-16 20:30:58 +00003074 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003075 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003076
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003077 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003078 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079}
3080
/* Normalize an encoding name into the caller-supplied buffer `lower`:
   ASCII upper case letters are lowered and '_' is replaced with '-', in
   order to catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   `lower_len` is the size of `lower` in bytes, including the room needed
   for the terminating null byte.

   Return 1 on success.  Return 0 if the normalized name (plus its null
   terminator) does not fit into `lower_len` bytes; in that case the
   contents of `lower` are unspecified and NOT null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* The default name must obey the buffer bound like any other:
           sizeof("utf-8") counts the terminating null byte. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* With an empty buffer there is not even room for the terminator, and
       &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3117
Alexander Belopolsky40018472011-02-26 01:02:56 +00003118PyObject *
3119PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003120 Py_ssize_t size,
3121 const char *encoding,
3122 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003123{
3124 PyObject *buffer = NULL, *unicode;
3125 Py_buffer info;
3126 char lower[11]; /* Enough for any encoding shortcut */
3127
Fred Drakee4315f52000-05-09 19:53:39 +00003128 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003129 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003130 if ((strcmp(lower, "utf-8") == 0) ||
3131 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003132 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003133 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003134 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003135 (strcmp(lower, "iso-8859-1") == 0))
3136 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003137#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003138 else if (strcmp(lower, "mbcs") == 0)
3139 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003140#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003141 else if (strcmp(lower, "ascii") == 0)
3142 return PyUnicode_DecodeASCII(s, size, errors);
3143 else if (strcmp(lower, "utf-16") == 0)
3144 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3145 else if (strcmp(lower, "utf-32") == 0)
3146 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003148
3149 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003150 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003151 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003152 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003153 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 if (buffer == NULL)
3155 goto onError;
3156 unicode = PyCodec_Decode(buffer, encoding, errors);
3157 if (unicode == NULL)
3158 goto onError;
3159 if (!PyUnicode_Check(unicode)) {
3160 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003161 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003162 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 Py_DECREF(unicode);
3164 goto onError;
3165 }
3166 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003167 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003168
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170 Py_XDECREF(buffer);
3171 return NULL;
3172}
3173
Alexander Belopolsky40018472011-02-26 01:02:56 +00003174PyObject *
3175PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003176 const char *encoding,
3177 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003178{
3179 PyObject *v;
3180
3181 if (!PyUnicode_Check(unicode)) {
3182 PyErr_BadArgument();
3183 goto onError;
3184 }
3185
3186 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003188
3189 /* Decode via the codec registry */
3190 v = PyCodec_Decode(unicode, encoding, errors);
3191 if (v == NULL)
3192 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003193 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003194
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003196 return NULL;
3197}
3198
Alexander Belopolsky40018472011-02-26 01:02:56 +00003199PyObject *
3200PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003201 const char *encoding,
3202 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203{
3204 PyObject *v;
3205
3206 if (!PyUnicode_Check(unicode)) {
3207 PyErr_BadArgument();
3208 goto onError;
3209 }
3210
3211 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003213
3214 /* Decode via the codec registry */
3215 v = PyCodec_Decode(unicode, encoding, errors);
3216 if (v == NULL)
3217 goto onError;
3218 if (!PyUnicode_Check(v)) {
3219 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003220 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003221 Py_TYPE(v)->tp_name);
3222 Py_DECREF(v);
3223 goto onError;
3224 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003225 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003226
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003228 return NULL;
3229}
3230
Alexander Belopolsky40018472011-02-26 01:02:56 +00003231PyObject *
3232PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003233 Py_ssize_t size,
3234 const char *encoding,
3235 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236{
3237 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003238
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 unicode = PyUnicode_FromUnicode(s, size);
3240 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3243 Py_DECREF(unicode);
3244 return v;
3245}
3246
Alexander Belopolsky40018472011-02-26 01:02:56 +00003247PyObject *
3248PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003249 const char *encoding,
3250 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003251{
3252 PyObject *v;
3253
3254 if (!PyUnicode_Check(unicode)) {
3255 PyErr_BadArgument();
3256 goto onError;
3257 }
3258
3259 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003260 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003261
3262 /* Encode via the codec registry */
3263 v = PyCodec_Encode(unicode, encoding, errors);
3264 if (v == NULL)
3265 goto onError;
3266 return v;
3267
Benjamin Peterson29060642009-01-31 22:14:21 +00003268 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003269 return NULL;
3270}
3271
/* Locate the first wide character of the null-terminated string `wstr`
   that wcstombs() refuses to convert, by converting one character at a
   time.  When wchar_t is 2 bytes wide, a high surrogate followed by a low
   surrogate is converted as one unit.

   Return the index (in wchar_t units) of the offending character, or 0 if
   every character converts -- callers cannot tell that apart from a
   failure at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus its null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        /* Start of the unit being probed. */
        const wchar_t *mark = cursor;

        unit[0] = *cursor++;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
            unit[1] = *cursor++;
        else
            unit[1] = 0;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return mark - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3318
Victor Stinner1b579672011-12-17 05:47:23 +01003319static int
3320locale_error_handler(const char *errors, int *surrogateescape)
3321{
3322 if (errors == NULL) {
3323 *surrogateescape = 0;
3324 return 0;
3325 }
3326
3327 if (strcmp(errors, "strict") == 0) {
3328 *surrogateescape = 0;
3329 return 0;
3330 }
3331 if (strcmp(errors, "surrogateescape") == 0) {
3332 *surrogateescape = 1;
3333 return 0;
3334 }
3335 PyErr_Format(PyExc_ValueError,
3336 "only 'strict' and 'surrogateescape' error handlers "
3337 "are supported, not '%s'",
3338 errors);
3339 return -1;
3340}
3341
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003342PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003343PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003344{
3345 Py_ssize_t wlen, wlen2;
3346 wchar_t *wstr;
3347 PyObject *bytes = NULL;
3348 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003349 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003350 PyObject *exc;
3351 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003352 int surrogateescape;
3353
3354 if (locale_error_handler(errors, &surrogateescape) < 0)
3355 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003356
3357 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3358 if (wstr == NULL)
3359 return NULL;
3360
3361 wlen2 = wcslen(wstr);
3362 if (wlen2 != wlen) {
3363 PyMem_Free(wstr);
3364 PyErr_SetString(PyExc_TypeError, "embedded null character");
3365 return NULL;
3366 }
3367
3368 if (surrogateescape) {
3369 /* locale encoding with surrogateescape */
3370 char *str;
3371
3372 str = _Py_wchar2char(wstr, &error_pos);
3373 if (str == NULL) {
3374 if (error_pos == (size_t)-1) {
3375 PyErr_NoMemory();
3376 PyMem_Free(wstr);
3377 return NULL;
3378 }
3379 else {
3380 goto encode_error;
3381 }
3382 }
3383 PyMem_Free(wstr);
3384
3385 bytes = PyBytes_FromString(str);
3386 PyMem_Free(str);
3387 }
3388 else {
3389 size_t len, len2;
3390
3391 len = wcstombs(NULL, wstr, 0);
3392 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003393 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003394 goto encode_error;
3395 }
3396
3397 bytes = PyBytes_FromStringAndSize(NULL, len);
3398 if (bytes == NULL) {
3399 PyMem_Free(wstr);
3400 return NULL;
3401 }
3402
3403 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3404 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003405 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003406 goto encode_error;
3407 }
3408 PyMem_Free(wstr);
3409 }
3410 return bytes;
3411
3412encode_error:
3413 errmsg = strerror(errno);
3414 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003415
3416 if (error_pos == (size_t)-1)
3417 error_pos = wcstombs_errorpos(wstr);
3418
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003419 PyMem_Free(wstr);
3420 Py_XDECREF(bytes);
3421
Victor Stinner2f197072011-12-17 07:08:30 +01003422 if (errmsg != NULL) {
3423 size_t errlen;
3424 wstr = _Py_char2wchar(errmsg, &errlen);
3425 if (wstr != NULL) {
3426 reason = PyUnicode_FromWideChar(wstr, errlen);
3427 PyMem_Free(wstr);
3428 } else
3429 errmsg = NULL;
3430 }
3431 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003432 reason = PyUnicode_FromString(
3433 "wcstombs() encountered an unencodable "
3434 "wide character");
3435 if (reason == NULL)
3436 return NULL;
3437
3438 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3439 "locale", unicode,
3440 (Py_ssize_t)error_pos,
3441 (Py_ssize_t)(error_pos+1),
3442 reason);
3443 Py_DECREF(reason);
3444 if (exc != NULL) {
3445 PyCodec_StrictErrors(exc);
3446 Py_XDECREF(exc);
3447 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003448 return NULL;
3449}
3450
Victor Stinnerad158722010-10-27 00:25:46 +00003451PyObject *
3452PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003453{
Victor Stinner99b95382011-07-04 14:23:54 +02003454#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003455 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003456#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003457 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003458#else
Victor Stinner793b5312011-04-27 00:24:21 +02003459 PyInterpreterState *interp = PyThreadState_GET()->interp;
3460 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3461 cannot use it to encode and decode filenames before it is loaded. Load
3462 the Python codec requires to encode at least its own filename. Use the C
3463 version of the locale codec until the codec registry is initialized and
3464 the Python codec is loaded.
3465
3466 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3467 cannot only rely on it: check also interp->fscodec_initialized for
3468 subinterpreters. */
3469 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003470 return PyUnicode_AsEncodedString(unicode,
3471 Py_FileSystemDefaultEncoding,
3472 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003473 }
3474 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003475 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003476 }
Victor Stinnerad158722010-10-27 00:25:46 +00003477#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003478}
3479
Alexander Belopolsky40018472011-02-26 01:02:56 +00003480PyObject *
3481PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003482 const char *encoding,
3483 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484{
3485 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003486 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 if (!PyUnicode_Check(unicode)) {
3489 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 }
Fred Drakee4315f52000-05-09 19:53:39 +00003492
Fred Drakee4315f52000-05-09 19:53:39 +00003493 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003494 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003495 if ((strcmp(lower, "utf-8") == 0) ||
3496 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003497 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003498 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003499 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003500 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003501 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003502 }
Victor Stinner37296e82010-06-10 13:36:23 +00003503 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003504 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003505 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003506 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003507#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003508 else if (strcmp(lower, "mbcs") == 0)
3509 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003510#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003511 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003513 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514
3515 /* Encode via the codec registry */
3516 v = PyCodec_Encode(unicode, encoding, errors);
3517 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003518 return NULL;
3519
3520 /* The normal path */
3521 if (PyBytes_Check(v))
3522 return v;
3523
3524 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003525 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003526 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003527 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003528
3529 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3530 "encoder %s returned bytearray instead of bytes",
3531 encoding);
3532 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003533 Py_DECREF(v);
3534 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003536
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003537 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3538 Py_DECREF(v);
3539 return b;
3540 }
3541
3542 PyErr_Format(PyExc_TypeError,
3543 "encoder did not return a bytes object (type=%.400s)",
3544 Py_TYPE(v)->tp_name);
3545 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003546 return NULL;
3547}
3548
Alexander Belopolsky40018472011-02-26 01:02:56 +00003549PyObject *
3550PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003551 const char *encoding,
3552 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003553{
3554 PyObject *v;
3555
3556 if (!PyUnicode_Check(unicode)) {
3557 PyErr_BadArgument();
3558 goto onError;
3559 }
3560
3561 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003562 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003563
3564 /* Encode via the codec registry */
3565 v = PyCodec_Encode(unicode, encoding, errors);
3566 if (v == NULL)
3567 goto onError;
3568 if (!PyUnicode_Check(v)) {
3569 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003570 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003571 Py_TYPE(v)->tp_name);
3572 Py_DECREF(v);
3573 goto onError;
3574 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003576
Benjamin Peterson29060642009-01-31 22:14:21 +00003577 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 return NULL;
3579}
3580
/* Find the first multibyte sequence in str[0..len) that mbrtowc() cannot
   convert and return its byte offset from the start of str.

   Returns 0 when no such sequence is found, and always 0 when
   HAVE_MBRTOWC is not defined. A result of 0 is therefore ambiguous with
   an error located at the very first byte. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Decoded a null character: end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Invalid or incomplete character starts at cursor */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* Everything converted: no undecodable byte sequence to report */
    return 0;
#else
    return 0;
#endif
}
3611
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003612PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003613PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003614 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003615{
3616 wchar_t smallbuf[256];
3617 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3618 wchar_t *wstr;
3619 size_t wlen, wlen2;
3620 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003621 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003622 size_t error_pos;
3623 char *errmsg;
3624 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003625
3626 if (locale_error_handler(errors, &surrogateescape) < 0)
3627 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003628
3629 if (str[len] != '\0' || len != strlen(str)) {
3630 PyErr_SetString(PyExc_TypeError, "embedded null character");
3631 return NULL;
3632 }
3633
3634 if (surrogateescape)
3635 {
3636 wstr = _Py_char2wchar(str, &wlen);
3637 if (wstr == NULL) {
3638 if (wlen == (size_t)-1)
3639 PyErr_NoMemory();
3640 else
3641 PyErr_SetFromErrno(PyExc_OSError);
3642 return NULL;
3643 }
3644
3645 unicode = PyUnicode_FromWideChar(wstr, wlen);
3646 PyMem_Free(wstr);
3647 }
3648 else {
3649#ifndef HAVE_BROKEN_MBSTOWCS
3650 wlen = mbstowcs(NULL, str, 0);
3651#else
3652 wlen = len;
3653#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003654 if (wlen == (size_t)-1)
3655 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003656 if (wlen+1 <= smallbuf_len) {
3657 wstr = smallbuf;
3658 }
3659 else {
3660 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3661 return PyErr_NoMemory();
3662
3663 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3664 if (!wstr)
3665 return PyErr_NoMemory();
3666 }
3667
3668 /* This shouldn't fail now */
3669 wlen2 = mbstowcs(wstr, str, wlen+1);
3670 if (wlen2 == (size_t)-1) {
3671 if (wstr != smallbuf)
3672 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003673 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003674 }
3675#ifdef HAVE_BROKEN_MBSTOWCS
3676 assert(wlen2 == wlen);
3677#endif
3678 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3679 if (wstr != smallbuf)
3680 PyMem_Free(wstr);
3681 }
3682 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003683
3684decode_error:
3685 errmsg = strerror(errno);
3686 assert(errmsg != NULL);
3687
3688 error_pos = mbstowcs_errorpos(str, len);
3689 if (errmsg != NULL) {
3690 size_t errlen;
3691 wstr = _Py_char2wchar(errmsg, &errlen);
3692 if (wstr != NULL) {
3693 reason = PyUnicode_FromWideChar(wstr, errlen);
3694 PyMem_Free(wstr);
3695 } else
3696 errmsg = NULL;
3697 }
3698 if (errmsg == NULL)
3699 reason = PyUnicode_FromString(
3700 "mbstowcs() encountered an invalid multibyte sequence");
3701 if (reason == NULL)
3702 return NULL;
3703
3704 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3705 "locale", str, len,
3706 (Py_ssize_t)error_pos,
3707 (Py_ssize_t)(error_pos+1),
3708 reason);
3709 Py_DECREF(reason);
3710 if (exc != NULL) {
3711 PyCodec_StrictErrors(exc);
3712 Py_XDECREF(exc);
3713 }
3714 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003715}
3716
3717PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003718PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719{
3720 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003721 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003722}
3723
3724
3725PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003726PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003728 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3729}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003730
Christian Heimes5894ba72007-11-04 11:43:14 +00003731PyObject*
3732PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3733{
Victor Stinner99b95382011-07-04 14:23:54 +02003734#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003735 return PyUnicode_DecodeMBCS(s, size, NULL);
3736#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003737 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003738#else
Victor Stinner793b5312011-04-27 00:24:21 +02003739 PyInterpreterState *interp = PyThreadState_GET()->interp;
3740 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3741 cannot use it to encode and decode filenames before it is loaded. Load
3742 the Python codec requires to encode at least its own filename. Use the C
3743 version of the locale codec until the codec registry is initialized and
3744 the Python codec is loaded.
3745
3746 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3747 cannot only rely on it: check also interp->fscodec_initialized for
3748 subinterpreters. */
3749 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003750 return PyUnicode_Decode(s, size,
3751 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003752 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003753 }
3754 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003755 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003756 }
Victor Stinnerad158722010-10-27 00:25:46 +00003757#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003758}
3759
Martin v. Löwis011e8422009-05-05 04:43:17 +00003760
3761int
Antoine Pitrou13348842012-01-29 18:36:34 +01003762_PyUnicode_HasNULChars(PyObject* s)
3763{
3764 static PyObject *nul = NULL;
3765
3766 if (nul == NULL)
3767 nul = PyUnicode_FromStringAndSize("\0", 1);
3768 if (nul == NULL)
3769 return -1;
3770 return PyUnicode_Contains(s, nul);
3771}
3772
3773
3774int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003775PyUnicode_FSConverter(PyObject* arg, void* addr)
3776{
3777 PyObject *output = NULL;
3778 Py_ssize_t size;
3779 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003780 if (arg == NULL) {
3781 Py_DECREF(*(PyObject**)addr);
3782 return 1;
3783 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003784 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003785 output = arg;
3786 Py_INCREF(output);
3787 }
3788 else {
3789 arg = PyUnicode_FromObject(arg);
3790 if (!arg)
3791 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003792 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003793 Py_DECREF(arg);
3794 if (!output)
3795 return 0;
3796 if (!PyBytes_Check(output)) {
3797 Py_DECREF(output);
3798 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3799 return 0;
3800 }
3801 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003802 size = PyBytes_GET_SIZE(output);
3803 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003804 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003805 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003806 Py_DECREF(output);
3807 return 0;
3808 }
3809 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003810 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003811}
3812
3813
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003814int
3815PyUnicode_FSDecoder(PyObject* arg, void* addr)
3816{
3817 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003818 if (arg == NULL) {
3819 Py_DECREF(*(PyObject**)addr);
3820 return 1;
3821 }
3822 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003823 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003825 output = arg;
3826 Py_INCREF(output);
3827 }
3828 else {
3829 arg = PyBytes_FromObject(arg);
3830 if (!arg)
3831 return 0;
3832 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3833 PyBytes_GET_SIZE(arg));
3834 Py_DECREF(arg);
3835 if (!output)
3836 return 0;
3837 if (!PyUnicode_Check(output)) {
3838 Py_DECREF(output);
3839 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3840 return 0;
3841 }
3842 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003843 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003844 Py_DECREF(output);
3845 return 0;
3846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003848 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003849 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3850 Py_DECREF(output);
3851 return 0;
3852 }
3853 *(PyObject**)addr = output;
3854 return Py_CLEANUP_SUPPORTED;
3855}
3856
3857
Martin v. Löwis5b222132007-06-10 09:51:05 +00003858char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003860{
Christian Heimesf3863112007-11-22 07:46:41 +00003861 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003863 if (!PyUnicode_Check(unicode)) {
3864 PyErr_BadArgument();
3865 return NULL;
3866 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003867 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003868 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003869
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003870 if (PyUnicode_UTF8(unicode) == NULL) {
3871 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3873 if (bytes == NULL)
3874 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003875 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3876 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003877 Py_DECREF(bytes);
3878 return NULL;
3879 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3881 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3882 PyBytes_AS_STRING(bytes),
3883 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 Py_DECREF(bytes);
3885 }
3886
3887 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003888 *psize = PyUnicode_UTF8_LENGTH(unicode);
3889 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003890}
3891
3892char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3896}
3897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898Py_UNICODE *
3899PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003901 const unsigned char *one_byte;
3902#if SIZEOF_WCHAR_T == 4
3903 const Py_UCS2 *two_bytes;
3904#else
3905 const Py_UCS4 *four_bytes;
3906 const Py_UCS4 *ucs4_end;
3907 Py_ssize_t num_surrogates;
3908#endif
3909 wchar_t *w;
3910 wchar_t *wchar_end;
3911
3912 if (!PyUnicode_Check(unicode)) {
3913 PyErr_BadArgument();
3914 return NULL;
3915 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003916 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003918 assert(_PyUnicode_KIND(unicode) != 0);
3919 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003921 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003923 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3924 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003925 num_surrogates = 0;
3926
3927 for (; four_bytes < ucs4_end; ++four_bytes) {
3928 if (*four_bytes > 0xFFFF)
3929 ++num_surrogates;
3930 }
3931
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003932 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3933 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3934 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935 PyErr_NoMemory();
3936 return NULL;
3937 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003938 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003940 w = _PyUnicode_WSTR(unicode);
3941 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3942 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3944 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003945 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003947 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3948 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 }
3950 else
3951 *w = *four_bytes;
3952
3953 if (w > wchar_end) {
3954 assert(0 && "Miscalculated string end");
3955 }
3956 }
3957 *w = 0;
3958#else
3959 /* sizeof(wchar_t) == 4 */
3960 Py_FatalError("Impossible unicode object state, wstr and str "
3961 "should share memory already.");
3962 return NULL;
3963#endif
3964 }
3965 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003966 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3967 (_PyUnicode_LENGTH(unicode) + 1));
3968 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969 PyErr_NoMemory();
3970 return NULL;
3971 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003972 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3973 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3974 w = _PyUnicode_WSTR(unicode);
3975 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003976
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003977 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3978 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003979 for (; w < wchar_end; ++one_byte, ++w)
3980 *w = *one_byte;
3981 /* null-terminate the wstr */
3982 *w = 0;
3983 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003984 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003986 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 for (; w < wchar_end; ++two_bytes, ++w)
3988 *w = *two_bytes;
3989 /* null-terminate the wstr */
3990 *w = 0;
3991#else
3992 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003993 PyObject_FREE(_PyUnicode_WSTR(unicode));
3994 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995 Py_FatalError("Impossible unicode object state, wstr "
3996 "and str should share memory already.");
3997 return NULL;
3998#endif
3999 }
4000 else {
4001 assert(0 && "This should never happen.");
4002 }
4003 }
4004 }
4005 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004006 *size = PyUnicode_WSTR_LENGTH(unicode);
4007 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00004008}
4009
Alexander Belopolsky40018472011-02-26 01:02:56 +00004010Py_UNICODE *
4011PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004013 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014}
4015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016
Alexander Belopolsky40018472011-02-26 01:02:56 +00004017Py_ssize_t
4018PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019{
4020 if (!PyUnicode_Check(unicode)) {
4021 PyErr_BadArgument();
4022 goto onError;
4023 }
4024 return PyUnicode_GET_SIZE(unicode);
4025
Benjamin Peterson29060642009-01-31 22:14:21 +00004026 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 return -1;
4028}
4029
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030Py_ssize_t
4031PyUnicode_GetLength(PyObject *unicode)
4032{
Victor Stinner07621332012-06-16 04:53:46 +02004033 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 PyErr_BadArgument();
4035 return -1;
4036 }
Victor Stinner07621332012-06-16 04:53:46 +02004037 if (PyUnicode_READY(unicode) == -1)
4038 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039 return PyUnicode_GET_LENGTH(unicode);
4040}
4041
4042Py_UCS4
4043PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4044{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004045 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4046 PyErr_BadArgument();
4047 return (Py_UCS4)-1;
4048 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004049 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004050 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 return (Py_UCS4)-1;
4052 }
4053 return PyUnicode_READ_CHAR(unicode, index);
4054}
4055
4056int
4057PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4058{
4059 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004060 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 return -1;
4062 }
Victor Stinner488fa492011-12-12 00:01:39 +01004063 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004064 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004065 PyErr_SetString(PyExc_IndexError, "string index out of range");
4066 return -1;
4067 }
Victor Stinner488fa492011-12-12 00:01:39 +01004068 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004069 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004070 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4071 PyErr_SetString(PyExc_ValueError, "character out of range");
4072 return -1;
4073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4075 index, ch);
4076 return 0;
4077}
4078
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4084
Victor Stinner554f3f02010-06-16 23:33:54 +00004085/* create or adjust a UnicodeDecodeError */
4086static void
4087make_decode_exception(PyObject **exceptionObject,
4088 const char *encoding,
4089 const char *input, Py_ssize_t length,
4090 Py_ssize_t startpos, Py_ssize_t endpos,
4091 const char *reason)
4092{
4093 if (*exceptionObject == NULL) {
4094 *exceptionObject = PyUnicodeDecodeError_Create(
4095 encoding, input, length, startpos, endpos, reason);
4096 }
4097 else {
4098 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4099 goto onError;
4100 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4101 goto onError;
4102 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4103 goto onError;
4104 }
4105 return;
4106
4107onError:
4108 Py_DECREF(*exceptionObject);
4109 *exceptionObject = NULL;
4110}
4111
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112/* error handling callback helper:
4113 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004114 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 and adjust various state variables.
4116 return 0 on success, -1 on error
4117*/
4118
Alexander Belopolsky40018472011-02-26 01:02:56 +00004119static int
4120unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004121 const char *encoding, const char *reason,
4122 const char **input, const char **inend, Py_ssize_t *startinpos,
4123 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004124 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004125{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004126 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004127
4128 PyObject *restuple = NULL;
4129 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004130 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004131 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004132 Py_ssize_t requiredsize;
4133 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004134 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 int res = -1;
4136
Victor Stinner596a6c42011-11-09 00:02:18 +01004137 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4138 outsize = PyUnicode_GET_LENGTH(*output);
4139 else
4140 outsize = _PyUnicode_WSTR_LENGTH(*output);
4141
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004142 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004143 *errorHandler = PyCodec_LookupError(errors);
4144 if (*errorHandler == NULL)
4145 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146 }
4147
Victor Stinner554f3f02010-06-16 23:33:54 +00004148 make_decode_exception(exceptionObject,
4149 encoding,
4150 *input, *inend - *input,
4151 *startinpos, *endinpos,
4152 reason);
4153 if (*exceptionObject == NULL)
4154 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155
4156 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4157 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004158 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004160 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162 }
4163 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004164 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004165 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004166 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004167
4168 /* Copy back the bytes variables, which might have been modified by the
4169 callback */
4170 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4171 if (!inputobj)
4172 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004173 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004175 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004176 *input = PyBytes_AS_STRING(inputobj);
4177 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004178 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004179 /* we can DECREF safely, as the exception has another reference,
4180 so the object won't go away. */
4181 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004182
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004184 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004185 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004186 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4187 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004188 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004189
Victor Stinner596a6c42011-11-09 00:02:18 +01004190 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4191 /* need more space? (at least enough for what we
4192 have+the replacement+the rest of the string (starting
4193 at the new input position), so we won't have to check space
4194 when there are no errors in the rest of the string) */
4195 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4196 requiredsize = *outpos + replen + insize-newpos;
4197 if (requiredsize > outsize) {
4198 if (requiredsize<2*outsize)
4199 requiredsize = 2*outsize;
4200 if (unicode_resize(output, requiredsize) < 0)
4201 goto onError;
4202 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004203 if (unicode_widen(output, *outpos,
4204 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004206 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004207 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004208 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004209 else {
4210 wchar_t *repwstr;
4211 Py_ssize_t repwlen;
4212 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4213 if (repwstr == NULL)
4214 goto onError;
4215 /* need more space? (at least enough for what we
4216 have+the replacement+the rest of the string (starting
4217 at the new input position), so we won't have to check space
4218 when there are no errors in the rest of the string) */
4219 requiredsize = *outpos + repwlen + insize-newpos;
4220 if (requiredsize > outsize) {
4221 if (requiredsize < 2*outsize)
4222 requiredsize = 2*outsize;
4223 if (unicode_resize(output, requiredsize) < 0)
4224 goto onError;
4225 }
4226 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4227 *outpos += repwlen;
4228 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004230 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004232 /* we made it! */
4233 res = 0;
4234
Benjamin Peterson29060642009-01-31 22:14:21 +00004235 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004236 Py_XDECREF(restuple);
4237 return res;
4238}
4239
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004240/* --- UTF-7 Codec -------------------------------------------------------- */
4241
Antoine Pitrou244651a2009-05-04 18:56:13 +00004242/* See RFC2152 for details. We encode conservatively and decode liberally. */
4243
/* Base-64 helper macros for the UTF-7 codec. Every macro below may
   evaluate its argument more than once, so only pass expressions
   without side effects. */

/* IS_BASE64: is c a member of the base-64 alphabet
   (A-Z, a-z, 0-9, '+' and '/')? */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')

/* FROM_BASE64: the 6-bit value (0..63) of the base-64 character c.
   Only meaningful when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n; higher bits
   of n are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: should the byte c, found in a UTF-7 string, be decoded
 * as itself? Decoding is permissive: every ASCII byte stands for itself
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
4274
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever
 * read, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4308
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the character code; only 1..127 can ever be direct
 *              (NUL and anything >= 128 never are)
 *   directO  - non-zero if "Set O" characters are written as themselves
 *   directWS - non-zero if whitespace is written as itself
 *
 * Both flags are parenthesized in the expansion so that compound
 * argument expressions (e.g. `a || b`) keep their meaning.  c is
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) < 128 && (c) > 0 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((directWS) && (utf7_category[(c)] == 2)) ||              \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320
Alexander Belopolsky40018472011-02-26 01:02:56 +00004321PyObject *
4322PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004323 Py_ssize_t size,
4324 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004326 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4327}
4328
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329/* The decoder. The only state we preserve is our read position,
4330 * i.e. how many characters we have consumed. So if we end in the
4331 * middle of a shift sequence we have to back off the read position
4332 * and the output to the beginning of the sequence, otherwise we lose
4333 * all the shift state (seen bits, number of bits seen, high
4334 * surrogate). */
4335
Alexander Belopolsky40018472011-02-26 01:02:56 +00004336PyObject *
4337PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004338 Py_ssize_t size,
4339 const char *errors,
4340 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004341{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004342 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004343 Py_ssize_t startinpos;
4344 Py_ssize_t endinpos;
4345 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004346 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004347 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004348 const char *errmsg = "";
4349 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004350 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004351 unsigned int base64bits = 0;
4352 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004353 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 PyObject *errorHandler = NULL;
4355 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004356
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004357 /* Start off assuming it's all ASCII. Widen later as necessary. */
4358 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004359 if (!unicode)
4360 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004361 if (size == 0) {
4362 if (consumed)
4363 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004364 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004365 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004366
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004367 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004368 e = s + size;
4369
4370 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004373 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 if (inShift) { /* in a base-64 section */
4376 if (IS_BASE64(ch)) { /* consume a base-64 character */
4377 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4378 base64bits += 6;
4379 s++;
4380 if (base64bits >= 16) {
4381 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004382 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004383 base64bits -= 16;
4384 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4385 if (surrogate) {
4386 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004387 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4388 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004389 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4390 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004392 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393 }
4394 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004395 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4396 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004398 }
4399 }
Victor Stinner551ac952011-11-29 22:58:13 +01004400 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 /* first surrogate */
4402 surrogate = outCh;
4403 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004404 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004405 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4406 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407 }
4408 }
4409 }
4410 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 inShift = 0;
4412 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004414 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4415 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004416 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 if (base64bits > 0) { /* left-over bits */
4419 if (base64bits >= 6) {
4420 /* We've seen at least one base-64 character */
4421 errmsg = "partial character in shift sequence";
4422 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004423 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 else {
4425 /* Some bits remain; they should be zero */
4426 if (base64buffer != 0) {
4427 errmsg = "non-zero padding bits in shift sequence";
4428 goto utf7Error;
4429 }
4430 }
4431 }
4432 if (ch != '-') {
4433 /* '-' is absorbed; other terminating
4434 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004435 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4436 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439 }
4440 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004442 s++; /* consume '+' */
4443 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004445 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4446 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 }
4448 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004449 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004450 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004451 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004452 }
4453 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004454 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004455 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4456 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 s++;
4458 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004459 else {
4460 startinpos = s-starts;
4461 s++;
4462 errmsg = "unexpected special character";
4463 goto utf7Error;
4464 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004465 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004467 endinpos = s-starts;
4468 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004469 errors, &errorHandler,
4470 "utf7", errmsg,
4471 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004472 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474 }
4475
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 /* end of string */
4477
4478 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4479 /* if we're in an inconsistent state, that's an error */
4480 if (surrogate ||
4481 (base64bits >= 6) ||
4482 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004483 endinpos = size;
4484 if (unicode_decode_call_errorhandler(
4485 errors, &errorHandler,
4486 "utf7", "unterminated shift sequence",
4487 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004488 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004489 goto onError;
4490 if (s < e)
4491 goto restart;
4492 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004494
4495 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004496 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004498 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004499 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 }
4501 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004502 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004503 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004504 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004505
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004506 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004507 goto onError;
4508
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004509 Py_XDECREF(errorHandler);
4510 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004511 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004514 Py_XDECREF(errorHandler);
4515 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 Py_DECREF(unicode);
4517 return NULL;
4518}
4519
4520
Alexander Belopolsky40018472011-02-26 01:02:56 +00004521PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004522_PyUnicode_EncodeUTF7(PyObject *str,
4523 int base64SetO,
4524 int base64WhiteSpace,
4525 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004526{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004527 int kind;
4528 void *data;
4529 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004530 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004531 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004532 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004533 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004534 unsigned int base64bits = 0;
4535 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 char * out;
4537 char * start;
4538
Benjamin Petersonbac79492012-01-14 13:34:47 -05004539 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004540 return NULL;
4541 kind = PyUnicode_KIND(str);
4542 data = PyUnicode_DATA(str);
4543 len = PyUnicode_GET_LENGTH(str);
4544
4545 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004546 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004548 /* It might be possible to tighten this worst case */
4549 allocated = 8 * len;
4550 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004551 return PyErr_NoMemory();
4552
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 if (v == NULL)
4555 return NULL;
4556
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004557 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004558 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004559 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 if (inShift) {
4562 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4563 /* shifting out */
4564 if (base64bits) { /* output remaining bits */
4565 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4566 base64buffer = 0;
4567 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004568 }
4569 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 /* Characters not in the BASE64 set implicitly unshift the sequence
4571 so no '-' is required, except if the character is itself a '-' */
4572 if (IS_BASE64(ch) || ch == '-') {
4573 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004574 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004575 *out++ = (char) ch;
4576 }
4577 else {
4578 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004579 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 else { /* not in a shift sequence */
4582 if (ch == '+') {
4583 *out++ = '+';
4584 *out++ = '-';
4585 }
4586 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4587 *out++ = (char) ch;
4588 }
4589 else {
4590 *out++ = '+';
4591 inShift = 1;
4592 goto encode_char;
4593 }
4594 }
4595 continue;
4596encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004598 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004599
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* code first surrogate */
4601 base64bits += 16;
4602 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4603 while (base64bits >= 6) {
4604 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4605 base64bits -= 6;
4606 }
4607 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004608 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004609 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 base64bits += 16;
4611 base64buffer = (base64buffer << 16) | ch;
4612 while (base64bits >= 6) {
4613 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4614 base64bits -= 6;
4615 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004616 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617 if (base64bits)
4618 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4619 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004621 if (_PyBytes_Resize(&v, out - start) < 0)
4622 return NULL;
4623 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004625PyObject *
4626PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4627 Py_ssize_t size,
4628 int base64SetO,
4629 int base64WhiteSpace,
4630 const char *errors)
4631{
4632 PyObject *result;
4633 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4634 if (tmp == NULL)
4635 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004636 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004637 base64WhiteSpace, errors);
4638 Py_DECREF(tmp);
4639 return result;
4640}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004641
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642#undef IS_BASE64
4643#undef FROM_BASE64
4644#undef TO_BASE64
4645#undef DECODE_DIRECT
4646#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648/* --- UTF-8 Codec -------------------------------------------------------- */
4649
Alexander Belopolsky40018472011-02-26 01:02:56 +00004650PyObject *
4651PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004652 Py_ssize_t size,
4653 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654{
Walter Dörwald69652032004-09-07 20:24:22 +00004655 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4656}
4657
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658#include "stringlib/asciilib.h"
4659#include "stringlib/codecs.h"
4660#include "stringlib/undef.h"
4661
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004662#include "stringlib/ucs1lib.h"
4663#include "stringlib/codecs.h"
4664#include "stringlib/undef.h"
4665
4666#include "stringlib/ucs2lib.h"
4667#include "stringlib/codecs.h"
4668#include "stringlib/undef.h"
4669
4670#include "stringlib/ucs4lib.h"
4671#include "stringlib/codecs.h"
4672#include "stringlib/undef.h"
4673
Antoine Pitrouab868312009-01-10 15:40:25 +00004674/* Mask to quickly check whether a C 'long' contains a
4675 non-ASCII, UTF8-encoded char. */
4676#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004677# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004678#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004679# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004680#else
4681# error C 'long' size should be either 4 or 8!
4682#endif
4683
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684static Py_ssize_t
4685ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004686{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004687 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004688 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004689
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004690#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004691 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4692 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 /* Fast path, see in STRINGLIB(utf8_decode) for
4694 an explanation. */
4695 /* Help register allocation */
4696 register const char *_p = p;
4697 register Py_UCS1 * q = dest;
4698 while (_p < aligned_end) {
4699 unsigned long value = *(const unsigned long *) _p;
4700 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004701 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004702 *((unsigned long *)q) = value;
4703 _p += SIZEOF_LONG;
4704 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004705 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004706 p = _p;
4707 while (p < end) {
4708 if ((unsigned char)*p & 0x80)
4709 break;
4710 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004712 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714#endif
4715 while (p < end) {
4716 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4717 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004718 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004719 /* Help register allocation */
4720 register const char *_p = p;
4721 while (_p < aligned_end) {
4722 unsigned long value = *(unsigned long *) _p;
4723 if (value & ASCII_CHAR_MASK)
4724 break;
4725 _p += SIZEOF_LONG;
4726 }
4727 p = _p;
4728 if (_p == end)
4729 break;
4730 }
4731 if ((unsigned char)*p & 0x80)
4732 break;
4733 ++p;
4734 }
4735 memcpy(dest, start, p - start);
4736 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737}
Antoine Pitrouab868312009-01-10 15:40:25 +00004738
Victor Stinner785938e2011-12-11 20:09:03 +01004739PyObject *
4740PyUnicode_DecodeUTF8Stateful(const char *s,
4741 Py_ssize_t size,
4742 const char *errors,
4743 Py_ssize_t *consumed)
4744{
Victor Stinner785938e2011-12-11 20:09:03 +01004745 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004746 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 const char *end = s + size;
4748 Py_ssize_t outpos;
4749
4750 Py_ssize_t startinpos;
4751 Py_ssize_t endinpos;
4752 const char *errmsg = "";
4753 PyObject *errorHandler = NULL;
4754 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004755
4756 if (size == 0) {
4757 if (consumed)
4758 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004759 Py_INCREF(unicode_empty);
4760 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004761 }
4762
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4764 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004765 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766 *consumed = 1;
4767 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004768 }
4769
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004771 if (!unicode)
4772 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004773
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004774 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4775 s += outpos;
4776 while (s < end) {
4777 Py_UCS4 ch;
4778 int kind = PyUnicode_KIND(unicode);
4779 if (kind == PyUnicode_1BYTE_KIND) {
4780 if (PyUnicode_IS_ASCII(unicode))
4781 ch = asciilib_utf8_decode(&s, end,
4782 PyUnicode_1BYTE_DATA(unicode), &outpos);
4783 else
4784 ch = ucs1lib_utf8_decode(&s, end,
4785 PyUnicode_1BYTE_DATA(unicode), &outpos);
4786 } else if (kind == PyUnicode_2BYTE_KIND) {
4787 ch = ucs2lib_utf8_decode(&s, end,
4788 PyUnicode_2BYTE_DATA(unicode), &outpos);
4789 } else {
4790 assert(kind == PyUnicode_4BYTE_KIND);
4791 ch = ucs4lib_utf8_decode(&s, end,
4792 PyUnicode_4BYTE_DATA(unicode), &outpos);
4793 }
4794
4795 switch (ch) {
4796 case 0:
4797 if (s == end || consumed)
4798 goto End;
4799 errmsg = "unexpected end of data";
4800 startinpos = s - starts;
4801 endinpos = startinpos + 1;
4802 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4803 endinpos++;
4804 break;
4805 case 1:
4806 errmsg = "invalid start byte";
4807 startinpos = s - starts;
4808 endinpos = startinpos + 1;
4809 break;
4810 case 2:
4811 errmsg = "invalid continuation byte";
4812 startinpos = s - starts;
4813 endinpos = startinpos + 1;
4814 while (endinpos < size && (starts[endinpos] & 0xC0) == 0x80)
4815 endinpos++;
4816 break;
4817 default:
4818 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4819 goto onError;
4820 continue;
4821 }
4822
4823 if (unicode_decode_call_errorhandler(
4824 errors, &errorHandler,
4825 "utf-8", errmsg,
4826 &starts, &end, &startinpos, &endinpos, &exc, &s,
4827 &unicode, &outpos))
4828 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004829 }
4830
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831End:
4832 if (unicode_resize(&unicode, outpos) < 0)
4833 goto onError;
4834
4835 if (consumed)
4836 *consumed = s - starts;
4837
4838 Py_XDECREF(errorHandler);
4839 Py_XDECREF(exc);
4840 assert(_PyUnicode_CheckConsistency(unicode, 1));
4841 return unicode;
4842
4843onError:
4844 Py_XDECREF(errorHandler);
4845 Py_XDECREF(exc);
4846 Py_XDECREF(unicode);
4847 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004848}
4849
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Returns a null-terminated wchar_t buffer obtained from
   PyMem_Malloc(), or NULL if the buffer cannot be allocated.  A byte
   that cannot be decoded is stored as 0xDC00 + byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *stop;
    wchar_t *buffer;
    Py_ssize_t written = 0;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    stop = s + size;
    while (s < stop) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, stop, (Py_UCS4 *)buffer, &written);
#else
        ch = ucs2lib_utf8_decode(&s, stop, (Py_UCS2 *)buffer, &written);
#endif
        if (ch <= 0xFF) {
            /* the decoder stopped: either the input is exhausted ... */
            if (ch == 0 && s == stop)
                break;
            /* ... or the byte at s is escaped (surrogateescape) */
            buffer[written++] = 0xDC00 + (unsigned char)*s++;
            continue;
        }
#if SIZEOF_WCHAR_T == 4
        assert(0);
#else
        assert(Py_UNICODE_IS_SURROGATE(ch));
        /* compute and append the two surrogates: */
        buffer[written++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
        buffer[written++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
    }
    buffer[written] = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004905/* Primary internal function which creates utf8 encoded bytes objects.
4906
4907 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004908 and allocate exactly as much space needed at the end. Else allocate the
4909 maximum possible needed (4 result bytes per Unicode character), and return
4910 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004911*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004912PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004913_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914{
Victor Stinner6099a032011-12-18 14:22:26 +01004915 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004916 void *data;
4917 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004919 if (!PyUnicode_Check(unicode)) {
4920 PyErr_BadArgument();
4921 return NULL;
4922 }
4923
4924 if (PyUnicode_READY(unicode) == -1)
4925 return NULL;
4926
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004927 if (PyUnicode_UTF8(unicode))
4928 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4929 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004930
4931 kind = PyUnicode_KIND(unicode);
4932 data = PyUnicode_DATA(unicode);
4933 size = PyUnicode_GET_LENGTH(unicode);
4934
Benjamin Petersonead6b532011-12-20 17:23:42 -06004935 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004936 default:
4937 assert(0);
4938 case PyUnicode_1BYTE_KIND:
4939 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4940 assert(!PyUnicode_IS_ASCII(unicode));
4941 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4942 case PyUnicode_2BYTE_KIND:
4943 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4944 case PyUnicode_4BYTE_KIND:
4945 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004946 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947}
4948
Alexander Belopolsky40018472011-02-26 01:02:56 +00004949PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004950PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4951 Py_ssize_t size,
4952 const char *errors)
4953{
4954 PyObject *v, *unicode;
4955
4956 unicode = PyUnicode_FromUnicode(s, size);
4957 if (unicode == NULL)
4958 return NULL;
4959 v = _PyUnicode_AsUTF8String(unicode, errors);
4960 Py_DECREF(unicode);
4961 return v;
4962}
4963
4964PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004965PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004967 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004968}
4969
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970/* --- UTF-32 Codec ------------------------------------------------------- */
4971
4972PyObject *
4973PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004974 Py_ssize_t size,
4975 const char *errors,
4976 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977{
4978 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4979}
4980
4981PyObject *
4982PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 Py_ssize_t size,
4984 const char *errors,
4985 int *byteorder,
4986 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004987{
4988 const char *starts = s;
4989 Py_ssize_t startinpos;
4990 Py_ssize_t endinpos;
4991 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004992 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004993 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994 int bo = 0; /* assume native ordering by default */
4995 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004996 /* Offsets from q for retrieving bytes in the right order. */
4997#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4998 int iorder[] = {0, 1, 2, 3};
4999#else
5000 int iorder[] = {3, 2, 1, 0};
5001#endif
5002 PyObject *errorHandler = NULL;
5003 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005004
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005 q = (unsigned char *)s;
5006 e = q + size;
5007
5008 if (byteorder)
5009 bo = *byteorder;
5010
5011 /* Check for BOM marks (U+FEFF) in the input and adjust current
5012 byte order setting accordingly. In native mode, the leading BOM
5013 mark is skipped, in all other modes, it is copied to the output
5014 stream as-is (giving a ZWNBSP character). */
5015 if (bo == 0) {
5016 if (size >= 4) {
5017 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005019#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 if (bom == 0x0000FEFF) {
5021 q += 4;
5022 bo = -1;
5023 }
5024 else if (bom == 0xFFFE0000) {
5025 q += 4;
5026 bo = 1;
5027 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005028#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 if (bom == 0x0000FEFF) {
5030 q += 4;
5031 bo = 1;
5032 }
5033 else if (bom == 0xFFFE0000) {
5034 q += 4;
5035 bo = -1;
5036 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039 }
5040
5041 if (bo == -1) {
5042 /* force LE */
5043 iorder[0] = 0;
5044 iorder[1] = 1;
5045 iorder[2] = 2;
5046 iorder[3] = 3;
5047 }
5048 else if (bo == 1) {
5049 /* force BE */
5050 iorder[0] = 3;
5051 iorder[1] = 2;
5052 iorder[2] = 1;
5053 iorder[3] = 0;
5054 }
5055
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005056 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005057 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005058 if (!unicode)
5059 return NULL;
5060 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005061 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005062 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005063
Walter Dörwald41980ca2007-08-16 21:55:45 +00005064 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 Py_UCS4 ch;
5066 /* remaining bytes at the end? (size should be divisible by 4) */
5067 if (e-q<4) {
5068 if (consumed)
5069 break;
5070 errmsg = "truncated data";
5071 startinpos = ((const char *)q)-starts;
5072 endinpos = ((const char *)e)-starts;
5073 goto utf32Error;
5074 /* The remaining input chars are ignored if the callback
5075 chooses to skip the input */
5076 }
5077 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5078 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005079
Benjamin Peterson29060642009-01-31 22:14:21 +00005080 if (ch >= 0x110000)
5081 {
5082 errmsg = "codepoint not in range(0x110000)";
5083 startinpos = ((const char *)q)-starts;
5084 endinpos = startinpos+4;
5085 goto utf32Error;
5086 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005087 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5088 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005089 q += 4;
5090 continue;
5091 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005092 if (unicode_decode_call_errorhandler(
5093 errors, &errorHandler,
5094 "utf32", errmsg,
5095 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005096 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 }
5099
5100 if (byteorder)
5101 *byteorder = bo;
5102
5103 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105
5106 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005107 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108 goto onError;
5109
5110 Py_XDECREF(errorHandler);
5111 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005112 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115 Py_DECREF(unicode);
5116 Py_XDECREF(errorHandler);
5117 Py_XDECREF(exc);
5118 return NULL;
5119}
5120
5121PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005122_PyUnicode_EncodeUTF32(PyObject *str,
5123 const char *errors,
5124 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005125{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005126 int kind;
5127 void *data;
5128 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005129 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005131 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132 /* Offsets from p for storing byte pairs in the right order. */
5133#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5134 int iorder[] = {0, 1, 2, 3};
5135#else
5136 int iorder[] = {3, 2, 1, 0};
5137#endif
5138
Benjamin Peterson29060642009-01-31 22:14:21 +00005139#define STORECHAR(CH) \
5140 do { \
5141 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5142 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5143 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5144 p[iorder[0]] = (CH) & 0xff; \
5145 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146 } while(0)
5147
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005148 if (!PyUnicode_Check(str)) {
5149 PyErr_BadArgument();
5150 return NULL;
5151 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005152 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005153 return NULL;
5154 kind = PyUnicode_KIND(str);
5155 data = PyUnicode_DATA(str);
5156 len = PyUnicode_GET_LENGTH(str);
5157
5158 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005159 bytesize = nsize * 4;
5160 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005162 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163 if (v == NULL)
5164 return NULL;
5165
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005166 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005167 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005168 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005169 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005170 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005171
5172 if (byteorder == -1) {
5173 /* force LE */
5174 iorder[0] = 0;
5175 iorder[1] = 1;
5176 iorder[2] = 2;
5177 iorder[3] = 3;
5178 }
5179 else if (byteorder == 1) {
5180 /* force BE */
5181 iorder[0] = 3;
5182 iorder[1] = 2;
5183 iorder[2] = 1;
5184 iorder[3] = 0;
5185 }
5186
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005187 for (i = 0; i < len; i++)
5188 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005189
5190 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005191 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005192#undef STORECHAR
5193}
5194
Alexander Belopolsky40018472011-02-26 01:02:56 +00005195PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005196PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5197 Py_ssize_t size,
5198 const char *errors,
5199 int byteorder)
5200{
5201 PyObject *result;
5202 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5203 if (tmp == NULL)
5204 return NULL;
5205 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5206 Py_DECREF(tmp);
5207 return result;
5208}
5209
5210PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005211PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005212{
Victor Stinnerb960b342011-11-20 19:12:52 +01005213 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005214}
5215
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216/* --- UTF-16 Codec ------------------------------------------------------- */
5217
Tim Peters772747b2001-08-09 22:21:55 +00005218PyObject *
5219PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 Py_ssize_t size,
5221 const char *errors,
5222 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223{
Walter Dörwald69652032004-09-07 20:24:22 +00005224 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5225}
5226
5227PyObject *
5228PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005229 Py_ssize_t size,
5230 const char *errors,
5231 int *byteorder,
5232 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005234 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005235 Py_ssize_t startinpos;
5236 Py_ssize_t endinpos;
5237 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005238 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005240 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005242 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005243 PyObject *errorHandler = NULL;
5244 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245
Tim Peters772747b2001-08-09 22:21:55 +00005246 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005247 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
5249 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005250 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005252 /* Check for BOM marks (U+FEFF) in the input and adjust current
5253 byte order setting accordingly. In native mode, the leading BOM
5254 mark is skipped, in all other modes, it is copied to the output
5255 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005256 if (bo == 0 && size >= 2) {
5257 const Py_UCS4 bom = (q[1] << 8) | q[0];
5258 if (bom == 0xFEFF) {
5259 q += 2;
5260 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005262 else if (bom == 0xFFFE) {
5263 q += 2;
5264 bo = 1;
5265 }
5266 if (byteorder)
5267 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269
Antoine Pitrou63065d72012-05-15 23:48:04 +02005270 if (q == e) {
5271 if (consumed)
5272 *consumed = size;
5273 Py_INCREF(unicode_empty);
5274 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005275 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005276
Antoine Pitrouab868312009-01-10 15:40:25 +00005277#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005278 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005279#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005280 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005281#endif
Tim Peters772747b2001-08-09 22:21:55 +00005282
Antoine Pitrou63065d72012-05-15 23:48:04 +02005283 /* Note: size will always be longer than the resulting Unicode
5284 character count */
5285 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5286 if (!unicode)
5287 return NULL;
5288
5289 outpos = 0;
5290 while (1) {
5291 Py_UCS4 ch = 0;
5292 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005293 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005294 if (kind == PyUnicode_1BYTE_KIND) {
5295 if (PyUnicode_IS_ASCII(unicode))
5296 ch = asciilib_utf16_decode(&q, e,
5297 PyUnicode_1BYTE_DATA(unicode), &outpos,
5298 native_ordering);
5299 else
5300 ch = ucs1lib_utf16_decode(&q, e,
5301 PyUnicode_1BYTE_DATA(unicode), &outpos,
5302 native_ordering);
5303 } else if (kind == PyUnicode_2BYTE_KIND) {
5304 ch = ucs2lib_utf16_decode(&q, e,
5305 PyUnicode_2BYTE_DATA(unicode), &outpos,
5306 native_ordering);
5307 } else {
5308 assert(kind == PyUnicode_4BYTE_KIND);
5309 ch = ucs4lib_utf16_decode(&q, e,
5310 PyUnicode_4BYTE_DATA(unicode), &outpos,
5311 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005312 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005313 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005314
Antoine Pitrou63065d72012-05-15 23:48:04 +02005315 switch (ch)
5316 {
5317 case 0:
5318 /* remaining byte at the end? (size should be even) */
5319 if (q == e || consumed)
5320 goto End;
5321 errmsg = "truncated data";
5322 startinpos = ((const char *)q) - starts;
5323 endinpos = ((const char *)e) - starts;
5324 break;
5325 /* The remaining input chars are ignored if the callback
5326 chooses to skip the input */
5327 case 1:
5328 errmsg = "unexpected end of data";
5329 startinpos = ((const char *)q) - 2 - starts;
5330 endinpos = ((const char *)e) - starts;
5331 break;
5332 case 2:
5333 errmsg = "illegal encoding";
5334 startinpos = ((const char *)q) - 2 - starts;
5335 endinpos = startinpos + 2;
5336 break;
5337 case 3:
5338 errmsg = "illegal UTF-16 surrogate";
5339 startinpos = ((const char *)q) - 4 - starts;
5340 endinpos = startinpos + 2;
5341 break;
5342 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005343 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5344 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005345 continue;
5346 }
5347
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005349 errors,
5350 &errorHandler,
5351 "utf16", errmsg,
5352 &starts,
5353 (const char **)&e,
5354 &startinpos,
5355 &endinpos,
5356 &exc,
5357 (const char **)&q,
5358 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005359 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 }
5362
Antoine Pitrou63065d72012-05-15 23:48:04 +02005363End:
Walter Dörwald69652032004-09-07 20:24:22 +00005364 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005368 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 goto onError;
5370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005371 Py_XDECREF(errorHandler);
5372 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005373 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005377 Py_XDECREF(errorHandler);
5378 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005379 return NULL;
5380}
5381
Tim Peters772747b2001-08-09 22:21:55 +00005382PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383_PyUnicode_EncodeUTF16(PyObject *str,
5384 const char *errors,
5385 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005387 enum PyUnicode_Kind kind;
5388 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005391 unsigned short *out;
5392 Py_ssize_t bytesize;
5393 Py_ssize_t pairs;
5394#ifdef WORDS_BIGENDIAN
5395 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005396#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005397 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005398#endif
5399
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005400 if (!PyUnicode_Check(str)) {
5401 PyErr_BadArgument();
5402 return NULL;
5403 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005404 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005405 return NULL;
5406 kind = PyUnicode_KIND(str);
5407 data = PyUnicode_DATA(str);
5408 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005409
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005410 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 if (kind == PyUnicode_4BYTE_KIND) {
5412 const Py_UCS4 *in = (const Py_UCS4 *)data;
5413 const Py_UCS4 *end = in + len;
5414 while (in < end)
5415 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005416 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005417 }
5418 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005420 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005421 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422 if (v == NULL)
5423 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005425 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005426 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005427 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005429 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005430 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005431 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005432
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005433 switch (kind) {
5434 case PyUnicode_1BYTE_KIND: {
5435 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5436 break;
Tim Peters772747b2001-08-09 22:21:55 +00005437 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005438 case PyUnicode_2BYTE_KIND: {
5439 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5440 break;
Tim Peters772747b2001-08-09 22:21:55 +00005441 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005442 case PyUnicode_4BYTE_KIND: {
5443 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5444 break;
5445 }
5446 default:
5447 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005448 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005449
5450 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005451 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452}
5453
Alexander Belopolsky40018472011-02-26 01:02:56 +00005454PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005455PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5456 Py_ssize_t size,
5457 const char *errors,
5458 int byteorder)
5459{
5460 PyObject *result;
5461 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5462 if (tmp == NULL)
5463 return NULL;
5464 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5465 Py_DECREF(tmp);
5466 return result;
5467}
5468
5469PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005470PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005472 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473}
5474
5475/* --- Unicode Escape Codec ----------------------------------------------- */
5476
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.
   Returns -1 if any escapes were found which cause the string to
   pop out of ASCII range (also for non-ASCII input bytes, a trailing
   lone backslash, or a negative size).  Otherwise returns the length
   of the required buffer to hold the string.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming `s + size`: computing a pointer in
       front of the buffer is not valid pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5531
/* Name-lookup C API used for \N{...} escapes.  Starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in via PyCapsule_Import() the
   first time it meets a \N escape and reuses the pointer afterwards. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005533
Alexander Belopolsky40018472011-02-26 01:02:56 +00005534PyObject *
5535PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005536 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005537 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005539 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005540 Py_ssize_t startinpos;
5541 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005542 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005543 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005545 char* message;
5546 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005547 PyObject *errorHandler = NULL;
5548 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005549 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005550 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005551
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005553
5554 /* After length_of_escaped_ascii_string() there are two alternatives,
5555 either the string is pure ASCII with named escapes like \n, etc.
5556 and we determined it's exact size (common case)
5557 or it contains \x, \u, ... escape sequences. then we create a
5558 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 if (len >= 0) {
5560 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005561 if (!v)
5562 goto onError;
5563 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564 }
5565 else {
5566 /* Escaped strings will always be longer than the resulting
5567 Unicode string, so we start with size here and then reduce the
5568 length after conversion to the true value.
5569 (but if the error callback returns a long replacement string
5570 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572 if (!v)
5573 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005574 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005575 }
5576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005578 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005580 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005581
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 while (s < end) {
5583 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005584 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005585 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 /* The only case in which i == ascii_length is a backslash
5588 followed by a newline. */
5589 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005590
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 /* Non-escape characters are interpreted as Unicode ordinals */
5592 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005593 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5594 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 continue;
5596 }
5597
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005598 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 /* \ - Escapes */
5600 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005601 c = *s++;
5602 if (s > end)
5603 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005604
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005605 /* The only case in which i == ascii_length is a backslash
5606 followed by a newline. */
5607 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005608
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005609 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005612#define WRITECHAR(ch) \
5613 do { \
5614 if (unicode_putchar(&v, &i, ch) < 0) \
5615 goto onError; \
5616 }while(0)
5617
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005619 case '\\': WRITECHAR('\\'); break;
5620 case '\'': WRITECHAR('\''); break;
5621 case '\"': WRITECHAR('\"'); break;
5622 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005623 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005624 case 'f': WRITECHAR('\014'); break;
5625 case 't': WRITECHAR('\t'); break;
5626 case 'n': WRITECHAR('\n'); break;
5627 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005628 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005629 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005630 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005631 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632
Benjamin Peterson29060642009-01-31 22:14:21 +00005633 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 case '0': case '1': case '2': case '3':
5635 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005636 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005637 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005638 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005639 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005640 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 break;
5644
Benjamin Peterson29060642009-01-31 22:14:21 +00005645 /* hex escapes */
5646 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005648 digits = 2;
5649 message = "truncated \\xXX escape";
5650 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651
Benjamin Peterson29060642009-01-31 22:14:21 +00005652 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654 digits = 4;
5655 message = "truncated \\uXXXX escape";
5656 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Benjamin Peterson29060642009-01-31 22:14:21 +00005658 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005659 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005660 digits = 8;
5661 message = "truncated \\UXXXXXXXX escape";
5662 hexescape:
5663 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 if (s+digits>end) {
5665 endinpos = size;
5666 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005667 errors, &errorHandler,
5668 "unicodeescape", "end of string in escape sequence",
5669 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005670 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005671 goto onError;
5672 goto nextByte;
5673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 for (j = 0; j < digits; ++j) {
5675 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005676 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005678 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005679 errors, &errorHandler,
5680 "unicodeescape", message,
5681 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005682 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005683 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005684 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005685 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005686 }
5687 chr = (chr<<4) & ~0xF;
5688 if (c >= '0' && c <= '9')
5689 chr += c - '0';
5690 else if (c >= 'a' && c <= 'f')
5691 chr += 10 + c - 'a';
5692 else
5693 chr += 10 + c - 'A';
5694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005696 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005697 /* _decoding_error will have already written into the
5698 target buffer. */
5699 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005700 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005701 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005702 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005703 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005704 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005706 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 errors, &errorHandler,
5708 "unicodeescape", "illegal Unicode character",
5709 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005711 goto onError;
5712 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005713 break;
5714
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005716 case 'N':
5717 message = "malformed \\N character escape";
5718 if (ucnhash_CAPI == NULL) {
5719 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005720 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5721 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005722 if (ucnhash_CAPI == NULL)
5723 goto ucnhashError;
5724 }
5725 if (*s == '{') {
5726 const char *start = s+1;
5727 /* look for the closing brace */
5728 while (*s != '}' && s < end)
5729 s++;
5730 if (s > start && s < end && *s == '}') {
5731 /* found a name. look it up in the unicode database */
5732 message = "unknown Unicode character name";
5733 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005734 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005735 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005736 goto store;
5737 }
5738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005739 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005740 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 errors, &errorHandler,
5742 "unicodeescape", message,
5743 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005744 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005745 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005746 break;
5747
5748 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005749 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005750 message = "\\ at end of string";
5751 s--;
5752 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005753 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 errors, &errorHandler,
5755 "unicodeescape", message,
5756 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005757 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005758 goto onError;
5759 }
5760 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005761 WRITECHAR('\\');
5762 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005763 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005768 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005769#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005770
Victor Stinner16e6a802011-12-12 13:24:15 +01005771 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005772 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005773 Py_XDECREF(errorHandler);
5774 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005775 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005776
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005778 PyErr_SetString(
5779 PyExc_UnicodeError,
5780 "\\N escapes not supported (can't load unicodedata module)"
5781 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005782 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 Py_XDECREF(errorHandler);
5784 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005785 return NULL;
5786
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 Py_XDECREF(errorHandler);
5790 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005791 return NULL;
5792}
5793
5794/* Return a Unicode-Escape string version of the Unicode object.
5795
5796 If quotes is true, the string is enclosed in u"" or u'' quotes as
5797 appropriate.
5798
5799*/
5800
Alexander Belopolsky40018472011-02-26 01:02:56 +00005801PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005802PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005804 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005805 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005807 int kind;
5808 void *data;
5809 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810
Ezio Melottie7f90372012-10-05 03:33:31 +03005811 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005812 escape.
5813
Ezio Melottie7f90372012-10-05 03:33:31 +03005814 For UCS1 strings it's '\xxx', 4 bytes per source character.
5815 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5816 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005817 */
5818
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005819 if (!PyUnicode_Check(unicode)) {
5820 PyErr_BadArgument();
5821 return NULL;
5822 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005823 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005824 return NULL;
5825 len = PyUnicode_GET_LENGTH(unicode);
5826 kind = PyUnicode_KIND(unicode);
5827 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005828 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005829 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5830 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5831 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5832 }
5833
5834 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005835 return PyBytes_FromStringAndSize(NULL, 0);
5836
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005837 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005839
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005840 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005842 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 if (repr == NULL)
5845 return NULL;
5846
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005847 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005849 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005850 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005851
Walter Dörwald79e913e2007-05-12 11:08:06 +00005852 /* Escape backslashes */
5853 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854 *p++ = '\\';
5855 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005856 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005857 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005858
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005859 /* Map 21-bit characters to '\U00xxxxxx' */
5860 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005861 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005862 *p++ = '\\';
5863 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005864 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5865 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5866 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5867 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5868 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5869 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5870 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5871 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005873 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005874
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005876 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 *p++ = '\\';
5878 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005879 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5880 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5881 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5882 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005884
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005885 /* Map special whitespace to '\t', \n', '\r' */
5886 else if (ch == '\t') {
5887 *p++ = '\\';
5888 *p++ = 't';
5889 }
5890 else if (ch == '\n') {
5891 *p++ = '\\';
5892 *p++ = 'n';
5893 }
5894 else if (ch == '\r') {
5895 *p++ = '\\';
5896 *p++ = 'r';
5897 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005898
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005899 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005900 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005902 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005903 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5904 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005905 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005906
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 /* Copy everything else as-is */
5908 else
5909 *p++ = (char) ch;
5910 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 assert(p - PyBytes_AS_STRING(repr) > 0);
5913 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5914 return NULL;
5915 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916}
5917
Alexander Belopolsky40018472011-02-26 01:02:56 +00005918PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5920 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005922 PyObject *result;
5923 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5924 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005926 result = PyUnicode_AsUnicodeEscapeString(tmp);
5927 Py_DECREF(tmp);
5928 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929}
5930
5931/* --- Raw Unicode Escape Codec ------------------------------------------- */
5932
Alexander Belopolsky40018472011-02-26 01:02:56 +00005933PyObject *
5934PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005935 Py_ssize_t size,
5936 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005939 Py_ssize_t startinpos;
5940 Py_ssize_t endinpos;
5941 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005942 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 const char *end;
5944 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005945 PyObject *errorHandler = NULL;
5946 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005947
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948 /* Escaped strings will always be longer than the resulting
5949 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005950 length after conversion to the true value. (But decoding error
5951 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005956 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005957 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005958 end = s + size;
5959 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 unsigned char c;
5961 Py_UCS4 x;
5962 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005963 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005964
Benjamin Peterson29060642009-01-31 22:14:21 +00005965 /* Non-escape characters are interpreted as Unicode ordinals */
5966 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005967 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5968 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005970 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005971 startinpos = s-starts;
5972
5973 /* \u-escapes are only interpreted iff the number of leading
5974 backslashes if odd */
5975 bs = s;
5976 for (;s < end;) {
5977 if (*s != '\\')
5978 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005979 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5980 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 }
5982 if (((s - bs) & 1) == 0 ||
5983 s >= end ||
5984 (*s != 'u' && *s != 'U')) {
5985 continue;
5986 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005987 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 count = *s=='u' ? 4 : 8;
5989 s++;
5990
5991 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 for (x = 0, i = 0; i < count; ++i, ++s) {
5993 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005994 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005995 endinpos = s-starts;
5996 if (unicode_decode_call_errorhandler(
5997 errors, &errorHandler,
5998 "rawunicodeescape", "truncated \\uXXXX",
5999 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006000 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006001 goto onError;
6002 goto nextByte;
6003 }
6004 x = (x<<4) & ~0xF;
6005 if (c >= '0' && c <= '9')
6006 x += c - '0';
6007 else if (c >= 'a' && c <= 'f')
6008 x += 10 + c - 'a';
6009 else
6010 x += 10 + c - 'A';
6011 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006012 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006013 if (unicode_putchar(&v, &outpos, x) < 0)
6014 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006015 } else {
6016 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006017 if (unicode_decode_call_errorhandler(
6018 errors, &errorHandler,
6019 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006021 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006023 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 nextByte:
6025 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006026 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006027 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 Py_XDECREF(errorHandler);
6030 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006031 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006032
Benjamin Peterson29060642009-01-31 22:14:21 +00006033 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035 Py_XDECREF(errorHandler);
6036 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006037 return NULL;
6038}
6039
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040
Alexander Belopolsky40018472011-02-26 01:02:56 +00006041PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006042PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006044 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 char *p;
6046 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006047 Py_ssize_t expandsize, pos;
6048 int kind;
6049 void *data;
6050 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006052 if (!PyUnicode_Check(unicode)) {
6053 PyErr_BadArgument();
6054 return NULL;
6055 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006056 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 return NULL;
6058 kind = PyUnicode_KIND(unicode);
6059 data = PyUnicode_DATA(unicode);
6060 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006061 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6062 bytes, and 1 byte characters 4. */
6063 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006064
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006065 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006067
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006068 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 if (repr == NULL)
6070 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006072 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006074 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006075 for (pos = 0; pos < len; pos++) {
6076 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006077 /* Map 32-bit characters to '\Uxxxxxxxx' */
6078 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006079 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006080 *p++ = '\\';
6081 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006082 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6083 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6084 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6085 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6086 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6087 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6088 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6089 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006090 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006092 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093 *p++ = '\\';
6094 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006095 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6096 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6097 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6098 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006100 /* Copy everything else as-is */
6101 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 *p++ = (char) ch;
6103 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006104
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006105 assert(p > q);
6106 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006107 return NULL;
6108 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109}
6110
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6113 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006115 PyObject *result;
6116 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6117 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006118 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6120 Py_DECREF(tmp);
6121 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122}
6123
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006124/* --- Unicode Internal Codec ------------------------------------------- */
6125
Alexander Belopolsky40018472011-02-26 01:02:56 +00006126PyObject *
6127_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006128 Py_ssize_t size,
6129 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006130{
6131 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006132 Py_ssize_t startinpos;
6133 Py_ssize_t endinpos;
6134 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006135 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006136 const char *end;
6137 const char *reason;
6138 PyObject *errorHandler = NULL;
6139 PyObject *exc = NULL;
6140
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006141 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006142 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006143 1))
6144 return NULL;
6145
Thomas Wouters89f507f2006-12-13 04:49:30 +00006146 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006147 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006148 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006150 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006151 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006152 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006153 end = s + size;
6154
6155 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006156 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006157 Py_UCS4 ch;
6158 /* We copy the raw representation one byte at a time because the
6159 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006160 ((char *) &uch)[0] = s[0];
6161 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006162#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006163 ((char *) &uch)[2] = s[2];
6164 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006165#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006166 ch = uch;
6167
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006168 /* We have to sanity check the raw data, otherwise doom looms for
6169 some malformed UCS-4 data. */
6170 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006171#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006172 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006173#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006174 end-s < Py_UNICODE_SIZE
6175 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006177 startinpos = s - starts;
6178 if (end-s < Py_UNICODE_SIZE) {
6179 endinpos = end-starts;
6180 reason = "truncated input";
6181 }
6182 else {
6183 endinpos = s - starts + Py_UNICODE_SIZE;
6184 reason = "illegal code point (> 0x10FFFF)";
6185 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006186 if (unicode_decode_call_errorhandler(
6187 errors, &errorHandler,
6188 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006189 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006190 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006191 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006192 continue;
6193 }
6194
6195 s += Py_UNICODE_SIZE;
6196#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006197 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006198 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006199 Py_UNICODE uch2;
6200 ((char *) &uch2)[0] = s[0];
6201 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006202 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006203 {
Victor Stinner551ac952011-11-29 22:58:13 +01006204 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006205 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006206 }
6207 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006208#endif
6209
6210 if (unicode_putchar(&v, &outpos, ch) < 0)
6211 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006212 }
6213
Victor Stinner16e6a802011-12-12 13:24:15 +01006214 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006215 goto onError;
6216 Py_XDECREF(errorHandler);
6217 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006218 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006219
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006221 Py_XDECREF(v);
6222 Py_XDECREF(errorHandler);
6223 Py_XDECREF(exc);
6224 return NULL;
6225}
6226
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227/* --- Latin-1 Codec ------------------------------------------------------ */
6228
Alexander Belopolsky40018472011-02-26 01:02:56 +00006229PyObject *
6230PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006231 Py_ssize_t size,
6232 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006235 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236}
6237
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006239static void
6240make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006241 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006242 PyObject *unicode,
6243 Py_ssize_t startpos, Py_ssize_t endpos,
6244 const char *reason)
6245{
6246 if (*exceptionObject == NULL) {
6247 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006248 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006249 encoding, unicode, startpos, endpos, reason);
6250 }
6251 else {
6252 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6253 goto onError;
6254 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6255 goto onError;
6256 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6257 goto onError;
6258 return;
6259 onError:
6260 Py_DECREF(*exceptionObject);
6261 *exceptionObject = NULL;
6262 }
6263}
6264
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006265/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006266static void
6267raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006268 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006269 PyObject *unicode,
6270 Py_ssize_t startpos, Py_ssize_t endpos,
6271 const char *reason)
6272{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006273 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006274 encoding, unicode, startpos, endpos, reason);
6275 if (*exceptionObject != NULL)
6276 PyCodec_StrictErrors(*exceptionObject);
6277}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278
6279/* error handling callback helper:
6280 build arguments, call the callback and check the arguments,
6281 put the result into newpos and return the replacement string, which
6282 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006283static PyObject *
6284unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006285 PyObject **errorHandler,
6286 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006287 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006288 Py_ssize_t startpos, Py_ssize_t endpos,
6289 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006291 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006292 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 PyObject *restuple;
6294 PyObject *resunicode;
6295
6296 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 }
6301
Benjamin Petersonbac79492012-01-14 13:34:47 -05006302 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006303 return NULL;
6304 len = PyUnicode_GET_LENGTH(unicode);
6305
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006306 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006307 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006308 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006309 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310
6311 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006312 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006314 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006316 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006317 Py_DECREF(restuple);
6318 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006320 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006321 &resunicode, newpos)) {
6322 Py_DECREF(restuple);
6323 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006325 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6326 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6327 Py_DECREF(restuple);
6328 return NULL;
6329 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331 *newpos = len + *newpos;
6332 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6334 Py_DECREF(restuple);
6335 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006336 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337 Py_INCREF(resunicode);
6338 Py_DECREF(restuple);
6339 return resunicode;
6340}
6341
Alexander Belopolsky40018472011-02-26 01:02:56 +00006342static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006343unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006344 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006345 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006347 /* input state */
6348 Py_ssize_t pos=0, size;
6349 int kind;
6350 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006351 /* output object */
6352 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006353 /* pointer into the output */
6354 char *str;
6355 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006356 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006357 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6358 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006359 PyObject *errorHandler = NULL;
6360 PyObject *exc = NULL;
6361 /* the following variable is used for caching string comparisons
6362 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6363 int known_errorHandler = -1;
6364
Benjamin Petersonbac79492012-01-14 13:34:47 -05006365 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006366 return NULL;
6367 size = PyUnicode_GET_LENGTH(unicode);
6368 kind = PyUnicode_KIND(unicode);
6369 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 /* allocate enough for a simple encoding without
6371 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006372 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006373 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006374 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006376 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006377 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 ressize = size;
6379
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006380 while (pos < size) {
6381 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382
Benjamin Peterson29060642009-01-31 22:14:21 +00006383 /* can we encode this? */
6384 if (c<limit) {
6385 /* no overflow check, because we know that the space is enough */
6386 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006387 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006388 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006390 Py_ssize_t requiredsize;
6391 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006394 Py_ssize_t collstart = pos;
6395 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006396 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006397 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 ++collend;
6399 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6400 if (known_errorHandler==-1) {
6401 if ((errors==NULL) || (!strcmp(errors, "strict")))
6402 known_errorHandler = 1;
6403 else if (!strcmp(errors, "replace"))
6404 known_errorHandler = 2;
6405 else if (!strcmp(errors, "ignore"))
6406 known_errorHandler = 3;
6407 else if (!strcmp(errors, "xmlcharrefreplace"))
6408 known_errorHandler = 4;
6409 else
6410 known_errorHandler = 0;
6411 }
6412 switch (known_errorHandler) {
6413 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006414 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 goto onError;
6416 case 2: /* replace */
6417 while (collstart++<collend)
6418 *str++ = '?'; /* fall through */
6419 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 break;
6422 case 4: /* xmlcharrefreplace */
6423 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006424 /* determine replacement size */
6425 for (i = collstart, repsize = 0; i < collend; ++i) {
6426 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6427 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006428 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006429 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006430 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006431 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006433 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006434 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006435 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006439 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006440 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006442 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 if (requiredsize > ressize) {
6446 if (requiredsize<2*ressize)
6447 requiredsize = 2*ressize;
6448 if (_PyBytes_Resize(&res, requiredsize))
6449 goto onError;
6450 str = PyBytes_AS_STRING(res) + respos;
6451 ressize = requiredsize;
6452 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006453 /* generate replacement */
6454 for (i = collstart; i < collend; ++i) {
6455 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 break;
6459 default:
6460 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 encoding, reason, unicode, &exc,
6462 collstart, collend, &newpos);
6463 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006464 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006466 if (PyBytes_Check(repunicode)) {
6467 /* Directly copy bytes result to output. */
6468 repsize = PyBytes_Size(repunicode);
6469 if (repsize > 1) {
6470 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006471 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006472 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6473 Py_DECREF(repunicode);
6474 goto onError;
6475 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006476 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006477 ressize += repsize-1;
6478 }
6479 memcpy(str, PyBytes_AsString(repunicode), repsize);
6480 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006482 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006483 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006484 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 /* need more space? (at least enough for what we
6486 have+the replacement+the rest of the string, so
6487 we won't have to check space for encodable characters) */
6488 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 repsize = PyUnicode_GET_LENGTH(repunicode);
6490 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 if (requiredsize > ressize) {
6492 if (requiredsize<2*ressize)
6493 requiredsize = 2*ressize;
6494 if (_PyBytes_Resize(&res, requiredsize)) {
6495 Py_DECREF(repunicode);
6496 goto onError;
6497 }
6498 str = PyBytes_AS_STRING(res) + respos;
6499 ressize = requiredsize;
6500 }
6501 /* check if there is anything unencodable in the replacement
6502 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006503 for (i = 0; repsize-->0; ++i, ++str) {
6504 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006506 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006507 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006508 Py_DECREF(repunicode);
6509 goto onError;
6510 }
6511 *str = (char)c;
6512 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006513 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006514 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006516 }
6517 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006518 /* Resize if we allocated to much */
6519 size = str - PyBytes_AS_STRING(res);
6520 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006521 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006522 if (_PyBytes_Resize(&res, size) < 0)
6523 goto onError;
6524 }
6525
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 Py_XDECREF(errorHandler);
6527 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006528 return res;
6529
6530 onError:
6531 Py_XDECREF(res);
6532 Py_XDECREF(errorHandler);
6533 Py_XDECREF(exc);
6534 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535}
6536
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006537/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006538PyObject *
6539PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006540 Py_ssize_t size,
6541 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006543 PyObject *result;
6544 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6545 if (unicode == NULL)
6546 return NULL;
6547 result = unicode_encode_ucs1(unicode, errors, 256);
6548 Py_DECREF(unicode);
6549 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Alexander Belopolsky40018472011-02-26 01:02:56 +00006552PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006553_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554{
6555 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 PyErr_BadArgument();
6557 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006559 if (PyUnicode_READY(unicode) == -1)
6560 return NULL;
6561 /* Fast path: if it is a one-byte string, construct
6562 bytes object directly. */
6563 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6564 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6565 PyUnicode_GET_LENGTH(unicode));
6566 /* Non-Latin-1 characters present. Defer to above function to
6567 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006568 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006569}
6570
6571PyObject*
6572PyUnicode_AsLatin1String(PyObject *unicode)
6573{
6574 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575}
6576
6577/* --- 7-bit ASCII Codec -------------------------------------------------- */
6578
Alexander Belopolsky40018472011-02-26 01:02:56 +00006579PyObject *
6580PyUnicode_DecodeASCII(const char *s,
6581 Py_ssize_t size,
6582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006584 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006585 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006586 int kind;
6587 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006588 Py_ssize_t startinpos;
6589 Py_ssize_t endinpos;
6590 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591 const char *e;
6592 PyObject *errorHandler = NULL;
6593 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006594
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006595 if (size == 0) {
6596 Py_INCREF(unicode_empty);
6597 return unicode_empty;
6598 }
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006601 if (size == 1 && (unsigned char)s[0] < 128)
6602 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006603
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006604 unicode = PyUnicode_New(size, 127);
6605 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006606 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006607
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006609 data = PyUnicode_1BYTE_DATA(unicode);
6610 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6611 if (outpos == size)
6612 return unicode;
6613
6614 s += outpos;
6615 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006616 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 register unsigned char c = (unsigned char)*s;
6618 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006619 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 ++s;
6621 }
6622 else {
6623 startinpos = s-starts;
6624 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 if (unicode_decode_call_errorhandler(
6626 errors, &errorHandler,
6627 "ascii", "ordinal not in range(128)",
6628 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006629 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006630 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006631 kind = PyUnicode_KIND(unicode);
6632 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006635 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006637 Py_XDECREF(errorHandler);
6638 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006639 assert(_PyUnicode_CheckConsistency(unicode, 1));
6640 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006641
Benjamin Peterson29060642009-01-31 22:14:21 +00006642 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006643 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 Py_XDECREF(errorHandler);
6645 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 return NULL;
6647}
6648
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006650PyObject *
6651PyUnicode_EncodeASCII(const Py_UNICODE *p,
6652 Py_ssize_t size,
6653 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006655 PyObject *result;
6656 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6657 if (unicode == NULL)
6658 return NULL;
6659 result = unicode_encode_ucs1(unicode, errors, 128);
6660 Py_DECREF(unicode);
6661 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662}
6663
Alexander Belopolsky40018472011-02-26 01:02:56 +00006664PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
6667 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 PyErr_BadArgument();
6669 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671 if (PyUnicode_READY(unicode) == -1)
6672 return NULL;
6673 /* Fast path: if it is an ASCII-only string, construct bytes object
6674 directly. Else defer to above function to raise the exception. */
6675 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6676 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6677 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006679}
6680
6681PyObject *
6682PyUnicode_AsASCIIString(PyObject *unicode)
6683{
6684 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006685}
6686
Victor Stinner99b95382011-07-04 14:23:54 +02006687#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006688
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006689/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006690
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006691#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006692#define NEED_RETRY
6693#endif
6694
Victor Stinner3a50e702011-10-18 21:21:00 +02006695#ifndef WC_ERR_INVALID_CHARS
6696# define WC_ERR_INVALID_CHARS 0x0080
6697#endif
6698
6699static char*
6700code_page_name(UINT code_page, PyObject **obj)
6701{
6702 *obj = NULL;
6703 if (code_page == CP_ACP)
6704 return "mbcs";
6705 if (code_page == CP_UTF7)
6706 return "CP_UTF7";
6707 if (code_page == CP_UTF8)
6708 return "CP_UTF8";
6709
6710 *obj = PyBytes_FromFormat("cp%u", code_page);
6711 if (*obj == NULL)
6712 return NULL;
6713 return PyBytes_AS_STRING(*obj);
6714}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006715
Alexander Belopolsky40018472011-02-26 01:02:56 +00006716static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006717is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006718{
6719 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006720 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006721
Victor Stinner3a50e702011-10-18 21:21:00 +02006722 if (!IsDBCSLeadByteEx(code_page, *curr))
6723 return 0;
6724
6725 prev = CharPrevExA(code_page, s, curr, 0);
6726 if (prev == curr)
6727 return 1;
6728 /* FIXME: This code is limited to "true" double-byte encodings,
6729 as it assumes an incomplete character consists of a single
6730 byte. */
6731 if (curr - prev == 2)
6732 return 1;
6733 if (!IsDBCSLeadByteEx(code_page, *prev))
6734 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006735 return 0;
6736}
6737
Victor Stinner3a50e702011-10-18 21:21:00 +02006738static DWORD
6739decode_code_page_flags(UINT code_page)
6740{
6741 if (code_page == CP_UTF7) {
6742 /* The CP_UTF7 decoder only supports flags=0 */
6743 return 0;
6744 }
6745 else
6746 return MB_ERR_INVALID_CHARS;
6747}
6748
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006749/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006750 * Decode a byte string from a Windows code page into unicode object in strict
6751 * mode.
6752 *
6753 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6754 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006755 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006756static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006757decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006758 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006759 const char *in,
6760 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761{
Victor Stinner3a50e702011-10-18 21:21:00 +02006762 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006763 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006764 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006765
6766 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006767 assert(insize > 0);
6768 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6769 if (outsize <= 0)
6770 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006771
6772 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006774 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006775 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006776 if (*v == NULL)
6777 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006778 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006779 }
6780 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006782 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006783 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006784 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006785 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006786 }
6787
6788 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006789 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6790 if (outsize <= 0)
6791 goto error;
6792 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006793
Victor Stinner3a50e702011-10-18 21:21:00 +02006794error:
6795 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6796 return -2;
6797 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006798 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006799}
6800
Victor Stinner3a50e702011-10-18 21:21:00 +02006801/*
6802 * Decode a byte string from a code page into unicode object with an error
6803 * handler.
6804 *
6805 * Returns consumed size if succeed, or raise a WindowsError or
6806 * UnicodeDecodeError exception and returns -1 on error.
6807 */
6808static int
6809decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006810 PyObject **v,
6811 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006812 const char *errors)
6813{
6814 const char *startin = in;
6815 const char *endin = in + size;
6816 const DWORD flags = decode_code_page_flags(code_page);
6817 /* Ideally, we should get reason from FormatMessage. This is the Windows
6818 2000 English version of the message. */
6819 const char *reason = "No mapping for the Unicode character exists "
6820 "in the target code page.";
6821 /* each step cannot decode more than 1 character, but a character can be
6822 represented as a surrogate pair */
6823 wchar_t buffer[2], *startout, *out;
6824 int insize, outsize;
6825 PyObject *errorHandler = NULL;
6826 PyObject *exc = NULL;
6827 PyObject *encoding_obj = NULL;
6828 char *encoding;
6829 DWORD err;
6830 int ret = -1;
6831
6832 assert(size > 0);
6833
6834 encoding = code_page_name(code_page, &encoding_obj);
6835 if (encoding == NULL)
6836 return -1;
6837
6838 if (errors == NULL || strcmp(errors, "strict") == 0) {
6839 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6840 UnicodeDecodeError. */
6841 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6842 if (exc != NULL) {
6843 PyCodec_StrictErrors(exc);
6844 Py_CLEAR(exc);
6845 }
6846 goto error;
6847 }
6848
6849 if (*v == NULL) {
6850 /* Create unicode object */
6851 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6852 PyErr_NoMemory();
6853 goto error;
6854 }
Victor Stinnerab595942011-12-17 04:59:06 +01006855 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006856 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006857 if (*v == NULL)
6858 goto error;
6859 startout = PyUnicode_AS_UNICODE(*v);
6860 }
6861 else {
6862 /* Extend unicode object */
6863 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6864 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6865 PyErr_NoMemory();
6866 goto error;
6867 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006868 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 goto error;
6870 startout = PyUnicode_AS_UNICODE(*v) + n;
6871 }
6872
6873 /* Decode the byte string character per character */
6874 out = startout;
6875 while (in < endin)
6876 {
6877 /* Decode a character */
6878 insize = 1;
6879 do
6880 {
6881 outsize = MultiByteToWideChar(code_page, flags,
6882 in, insize,
6883 buffer, Py_ARRAY_LENGTH(buffer));
6884 if (outsize > 0)
6885 break;
6886 err = GetLastError();
6887 if (err != ERROR_NO_UNICODE_TRANSLATION
6888 && err != ERROR_INSUFFICIENT_BUFFER)
6889 {
6890 PyErr_SetFromWindowsErr(0);
6891 goto error;
6892 }
6893 insize++;
6894 }
6895 /* 4=maximum length of a UTF-8 sequence */
6896 while (insize <= 4 && (in + insize) <= endin);
6897
6898 if (outsize <= 0) {
6899 Py_ssize_t startinpos, endinpos, outpos;
6900
6901 startinpos = in - startin;
6902 endinpos = startinpos + 1;
6903 outpos = out - PyUnicode_AS_UNICODE(*v);
6904 if (unicode_decode_call_errorhandler(
6905 errors, &errorHandler,
6906 encoding, reason,
6907 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006908 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006909 {
6910 goto error;
6911 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006912 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006913 }
6914 else {
6915 in += insize;
6916 memcpy(out, buffer, outsize * sizeof(wchar_t));
6917 out += outsize;
6918 }
6919 }
6920
6921 /* write a NUL character at the end */
6922 *out = 0;
6923
6924 /* Extend unicode object */
6925 outsize = out - startout;
6926 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006927 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006929 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006930
6931error:
6932 Py_XDECREF(encoding_obj);
6933 Py_XDECREF(errorHandler);
6934 Py_XDECREF(exc);
6935 return ret;
6936}
6937
Victor Stinner3a50e702011-10-18 21:21:00 +02006938static PyObject *
6939decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006940 const char *s, Py_ssize_t size,
6941 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006942{
Victor Stinner76a31a62011-11-04 00:05:13 +01006943 PyObject *v = NULL;
6944 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006945
Victor Stinner3a50e702011-10-18 21:21:00 +02006946 if (code_page < 0) {
6947 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6948 return NULL;
6949 }
6950
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006951 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006953
Victor Stinner76a31a62011-11-04 00:05:13 +01006954 do
6955 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006957 if (size > INT_MAX) {
6958 chunk_size = INT_MAX;
6959 final = 0;
6960 done = 0;
6961 }
6962 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006964 {
6965 chunk_size = (int)size;
6966 final = (consumed == NULL);
6967 done = 1;
6968 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969
Victor Stinner76a31a62011-11-04 00:05:13 +01006970 /* Skip trailing lead-byte unless 'final' is set */
6971 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6972 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973
Victor Stinner76a31a62011-11-04 00:05:13 +01006974 if (chunk_size == 0 && done) {
6975 if (v != NULL)
6976 break;
6977 Py_INCREF(unicode_empty);
6978 return unicode_empty;
6979 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
Victor Stinner76a31a62011-11-04 00:05:13 +01006981
6982 converted = decode_code_page_strict(code_page, &v,
6983 s, chunk_size);
6984 if (converted == -2)
6985 converted = decode_code_page_errors(code_page, &v,
6986 s, chunk_size,
6987 errors);
6988 assert(converted != 0);
6989
6990 if (converted < 0) {
6991 Py_XDECREF(v);
6992 return NULL;
6993 }
6994
6995 if (consumed)
6996 *consumed += converted;
6997
6998 s += converted;
6999 size -= converted;
7000 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007001
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007002 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007003}
7004
Alexander Belopolsky40018472011-02-26 01:02:56 +00007005PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007006PyUnicode_DecodeCodePageStateful(int code_page,
7007 const char *s,
7008 Py_ssize_t size,
7009 const char *errors,
7010 Py_ssize_t *consumed)
7011{
7012 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7013}
7014
7015PyObject *
7016PyUnicode_DecodeMBCSStateful(const char *s,
7017 Py_ssize_t size,
7018 const char *errors,
7019 Py_ssize_t *consumed)
7020{
7021 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7022}
7023
7024PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007025PyUnicode_DecodeMBCS(const char *s,
7026 Py_ssize_t size,
7027 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007028{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007029 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7030}
7031
Victor Stinner3a50e702011-10-18 21:21:00 +02007032static DWORD
7033encode_code_page_flags(UINT code_page, const char *errors)
7034{
7035 if (code_page == CP_UTF8) {
7036 if (winver.dwMajorVersion >= 6)
7037 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7038 and later */
7039 return WC_ERR_INVALID_CHARS;
7040 else
7041 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7042 return 0;
7043 }
7044 else if (code_page == CP_UTF7) {
7045 /* CP_UTF7 only supports flags=0 */
7046 return 0;
7047 }
7048 else {
7049 if (errors != NULL && strcmp(errors, "replace") == 0)
7050 return 0;
7051 else
7052 return WC_NO_BEST_FIT_CHARS;
7053 }
7054}
7055
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007056/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 * Encode a Unicode string to a Windows code page into a byte string in strict
7058 * mode.
7059 *
7060 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7061 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007063static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007064encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007065 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007066 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007067{
Victor Stinner554f3f02010-06-16 23:33:54 +00007068 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 BOOL *pusedDefaultChar = &usedDefaultChar;
7070 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007071 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007072 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007073 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007074 const DWORD flags = encode_code_page_flags(code_page, NULL);
7075 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007076 /* Create a substring so that we can get the UTF-16 representation
7077 of just the slice under consideration. */
7078 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007079
Martin v. Löwis3d325192011-11-04 18:23:06 +01007080 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007081
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007083 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007085 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007086
Victor Stinner2fc507f2011-11-04 20:06:39 +01007087 substring = PyUnicode_Substring(unicode, offset, offset+len);
7088 if (substring == NULL)
7089 return -1;
7090 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7091 if (p == NULL) {
7092 Py_DECREF(substring);
7093 return -1;
7094 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007095
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007096 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007097 outsize = WideCharToMultiByte(code_page, flags,
7098 p, size,
7099 NULL, 0,
7100 NULL, pusedDefaultChar);
7101 if (outsize <= 0)
7102 goto error;
7103 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007104 if (pusedDefaultChar && *pusedDefaultChar) {
7105 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007107 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007108
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007110 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007111 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007112 if (*outbytes == NULL) {
7113 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007115 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007116 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 }
7118 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007119 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007120 const Py_ssize_t n = PyBytes_Size(*outbytes);
7121 if (outsize > PY_SSIZE_T_MAX - n) {
7122 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007123 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007126 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7127 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007128 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007129 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007130 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007131 }
7132
7133 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007134 outsize = WideCharToMultiByte(code_page, flags,
7135 p, size,
7136 out, outsize,
7137 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007138 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 if (outsize <= 0)
7140 goto error;
7141 if (pusedDefaultChar && *pusedDefaultChar)
7142 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007144
Victor Stinner3a50e702011-10-18 21:21:00 +02007145error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007146 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007147 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7148 return -2;
7149 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007150 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007151}
7152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153/*
7154 * Encode a Unicode string to a Windows code page into a byte string using a
7155 * error handler.
7156 *
7157 * Returns consumed characters if succeed, or raise a WindowsError and returns
7158 * -1 on other error.
7159 */
7160static int
7161encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007162 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007163 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007164{
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007166 Py_ssize_t pos = unicode_offset;
7167 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 /* Ideally, we should get reason from FormatMessage. This is the Windows
7169 2000 English version of the message. */
7170 const char *reason = "invalid character";
7171 /* 4=maximum length of a UTF-8 sequence */
7172 char buffer[4];
7173 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7174 Py_ssize_t outsize;
7175 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 PyObject *errorHandler = NULL;
7177 PyObject *exc = NULL;
7178 PyObject *encoding_obj = NULL;
7179 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007180 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 PyObject *rep;
7182 int ret = -1;
7183
7184 assert(insize > 0);
7185
7186 encoding = code_page_name(code_page, &encoding_obj);
7187 if (encoding == NULL)
7188 return -1;
7189
7190 if (errors == NULL || strcmp(errors, "strict") == 0) {
7191 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7192 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007193 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 if (exc != NULL) {
7195 PyCodec_StrictErrors(exc);
7196 Py_DECREF(exc);
7197 }
7198 Py_XDECREF(encoding_obj);
7199 return -1;
7200 }
7201
7202 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7203 pusedDefaultChar = &usedDefaultChar;
7204 else
7205 pusedDefaultChar = NULL;
7206
7207 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7208 PyErr_NoMemory();
7209 goto error;
7210 }
7211 outsize = insize * Py_ARRAY_LENGTH(buffer);
7212
7213 if (*outbytes == NULL) {
7214 /* Create string object */
7215 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7216 if (*outbytes == NULL)
7217 goto error;
7218 out = PyBytes_AS_STRING(*outbytes);
7219 }
7220 else {
7221 /* Extend string object */
7222 Py_ssize_t n = PyBytes_Size(*outbytes);
7223 if (n > PY_SSIZE_T_MAX - outsize) {
7224 PyErr_NoMemory();
7225 goto error;
7226 }
7227 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7228 goto error;
7229 out = PyBytes_AS_STRING(*outbytes) + n;
7230 }
7231
7232 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007233 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007235 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7236 wchar_t chars[2];
7237 int charsize;
7238 if (ch < 0x10000) {
7239 chars[0] = (wchar_t)ch;
7240 charsize = 1;
7241 }
7242 else {
7243 ch -= 0x10000;
7244 chars[0] = 0xd800 + (ch >> 10);
7245 chars[1] = 0xdc00 + (ch & 0x3ff);
7246 charsize = 2;
7247 }
7248
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007250 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 buffer, Py_ARRAY_LENGTH(buffer),
7252 NULL, pusedDefaultChar);
7253 if (outsize > 0) {
7254 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7255 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007256 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 memcpy(out, buffer, outsize);
7258 out += outsize;
7259 continue;
7260 }
7261 }
7262 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7263 PyErr_SetFromWindowsErr(0);
7264 goto error;
7265 }
7266
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 rep = unicode_encode_call_errorhandler(
7268 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007269 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007270 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 if (rep == NULL)
7272 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007273 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007274
7275 if (PyBytes_Check(rep)) {
7276 outsize = PyBytes_GET_SIZE(rep);
7277 if (outsize != 1) {
7278 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7279 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7280 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7281 Py_DECREF(rep);
7282 goto error;
7283 }
7284 out = PyBytes_AS_STRING(*outbytes) + offset;
7285 }
7286 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7287 out += outsize;
7288 }
7289 else {
7290 Py_ssize_t i;
7291 enum PyUnicode_Kind kind;
7292 void *data;
7293
Benjamin Petersonbac79492012-01-14 13:34:47 -05007294 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 Py_DECREF(rep);
7296 goto error;
7297 }
7298
7299 outsize = PyUnicode_GET_LENGTH(rep);
7300 if (outsize != 1) {
7301 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7302 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7303 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7304 Py_DECREF(rep);
7305 goto error;
7306 }
7307 out = PyBytes_AS_STRING(*outbytes) + offset;
7308 }
7309 kind = PyUnicode_KIND(rep);
7310 data = PyUnicode_DATA(rep);
7311 for (i=0; i < outsize; i++) {
7312 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7313 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007314 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007315 encoding, unicode,
7316 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 "unable to encode error handler result to ASCII");
7318 Py_DECREF(rep);
7319 goto error;
7320 }
7321 *out = (unsigned char)ch;
7322 out++;
7323 }
7324 }
7325 Py_DECREF(rep);
7326 }
7327 /* write a NUL byte */
7328 *out = 0;
7329 outsize = out - PyBytes_AS_STRING(*outbytes);
7330 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7331 if (_PyBytes_Resize(outbytes, outsize) < 0)
7332 goto error;
7333 ret = 0;
7334
7335error:
7336 Py_XDECREF(encoding_obj);
7337 Py_XDECREF(errorHandler);
7338 Py_XDECREF(exc);
7339 return ret;
7340}
7341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342static PyObject *
7343encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007344 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007345 const char *errors)
7346{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007347 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007349 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007350 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007351
Benjamin Petersonbac79492012-01-14 13:34:47 -05007352 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007353 return NULL;
7354 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007355
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 if (code_page < 0) {
7357 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7358 return NULL;
7359 }
7360
Martin v. Löwis3d325192011-11-04 18:23:06 +01007361 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007362 return PyBytes_FromStringAndSize(NULL, 0);
7363
Victor Stinner7581cef2011-11-03 22:32:33 +01007364 offset = 0;
7365 do
7366 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007367#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007368 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007369 chunks. */
7370 if (len > INT_MAX/2) {
7371 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 done = 0;
7373 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007374 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007377 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007378 done = 1;
7379 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007380
Victor Stinner76a31a62011-11-04 00:05:13 +01007381 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007382 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007383 errors);
7384 if (ret == -2)
7385 ret = encode_code_page_errors(code_page, &outbytes,
7386 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007387 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007388 if (ret < 0) {
7389 Py_XDECREF(outbytes);
7390 return NULL;
7391 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007392
Victor Stinner7581cef2011-11-03 22:32:33 +01007393 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007394 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007395 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007396
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 return outbytes;
7398}
7399
7400PyObject *
7401PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7402 Py_ssize_t size,
7403 const char *errors)
7404{
Victor Stinner7581cef2011-11-03 22:32:33 +01007405 PyObject *unicode, *res;
7406 unicode = PyUnicode_FromUnicode(p, size);
7407 if (unicode == NULL)
7408 return NULL;
7409 res = encode_code_page(CP_ACP, unicode, errors);
7410 Py_DECREF(unicode);
7411 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007412}
7413
7414PyObject *
7415PyUnicode_EncodeCodePage(int code_page,
7416 PyObject *unicode,
7417 const char *errors)
7418{
Victor Stinner7581cef2011-11-03 22:32:33 +01007419 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007420}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007421
Alexander Belopolsky40018472011-02-26 01:02:56 +00007422PyObject *
7423PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007424{
7425 if (!PyUnicode_Check(unicode)) {
7426 PyErr_BadArgument();
7427 return NULL;
7428 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007429 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007430}
7431
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432#undef NEED_RETRY
7433
Victor Stinner99b95382011-07-04 14:23:54 +02007434#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007435
Guido van Rossumd57fd912000-03-10 22:53:23 +00007436/* --- Character Mapping Codec -------------------------------------------- */
7437
Alexander Belopolsky40018472011-02-26 01:02:56 +00007438PyObject *
7439PyUnicode_DecodeCharmap(const char *s,
7440 Py_ssize_t size,
7441 PyObject *mapping,
7442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007443{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007444 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007445 Py_ssize_t startinpos;
7446 Py_ssize_t endinpos;
7447 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007448 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007449 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007450 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007451 PyObject *errorHandler = NULL;
7452 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007453
Guido van Rossumd57fd912000-03-10 22:53:23 +00007454 /* Default to Latin-1 */
7455 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007456 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007457
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007458 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007459 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007460 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007461 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007462 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007463 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007465 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007466 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007467 enum PyUnicode_Kind mapkind;
7468 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007469 Py_UCS4 x;
7470
Benjamin Petersonbac79492012-01-14 13:34:47 -05007471 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007472 return NULL;
7473
7474 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007475 mapdata = PyUnicode_DATA(mapping);
7476 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007477 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007478 unsigned char ch;
7479 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7480 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7481 if (outkind == PyUnicode_1BYTE_KIND) {
7482 void *outdata = PyUnicode_DATA(v);
7483 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7484 while (s < e) {
7485 unsigned char ch = *s;
7486 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7487 if (x > maxchar)
7488 goto Error;
7489 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7490 ++s;
7491 }
7492 break;
7493 }
7494 else if (outkind == PyUnicode_2BYTE_KIND) {
7495 void *outdata = PyUnicode_DATA(v);
7496 while (s < e) {
7497 unsigned char ch = *s;
7498 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7499 if (x == 0xFFFE)
7500 goto Error;
7501 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7502 ++s;
7503 }
7504 break;
7505 }
7506 }
7507 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007508
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007510 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007511 else
7512 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007513Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007514 if (x == 0xfffe)
7515 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 startinpos = s-starts;
7518 endinpos = startinpos+1;
7519 if (unicode_decode_call_errorhandler(
7520 errors, &errorHandler,
7521 "charmap", "character maps to <undefined>",
7522 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007523 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 goto onError;
7525 }
7526 continue;
7527 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007528
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007529 if (unicode_putchar(&v, &outpos, x) < 0)
7530 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007531 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007532 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007533 }
7534 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007535 while (s < e) {
7536 unsigned char ch = *s;
7537 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007538
Benjamin Peterson29060642009-01-31 22:14:21 +00007539 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7540 w = PyLong_FromLong((long)ch);
7541 if (w == NULL)
7542 goto onError;
7543 x = PyObject_GetItem(mapping, w);
7544 Py_DECREF(w);
7545 if (x == NULL) {
7546 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7547 /* No mapping found means: mapping is undefined. */
7548 PyErr_Clear();
7549 x = Py_None;
7550 Py_INCREF(x);
7551 } else
7552 goto onError;
7553 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007554
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 /* Apply mapping */
7556 if (PyLong_Check(x)) {
7557 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007558 if (value < 0 || value > MAX_UNICODE) {
7559 PyErr_Format(PyExc_TypeError,
7560 "character mapping must be in range(0x%lx)",
7561 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007562 Py_DECREF(x);
7563 goto onError;
7564 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007565 if (unicode_putchar(&v, &outpos, value) < 0)
7566 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007567 }
7568 else if (x == Py_None) {
7569 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 startinpos = s-starts;
7571 endinpos = startinpos+1;
7572 if (unicode_decode_call_errorhandler(
7573 errors, &errorHandler,
7574 "charmap", "character maps to <undefined>",
7575 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007576 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007577 Py_DECREF(x);
7578 goto onError;
7579 }
7580 Py_DECREF(x);
7581 continue;
7582 }
7583 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007584 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007585
Benjamin Petersonbac79492012-01-14 13:34:47 -05007586 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007587 goto onError;
7588 targetsize = PyUnicode_GET_LENGTH(x);
7589
7590 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007591 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007592 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007593 PyUnicode_READ_CHAR(x, 0)) < 0)
7594 goto onError;
7595 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 else if (targetsize > 1) {
7597 /* 1-n mapping */
7598 if (targetsize > extrachars) {
7599 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 Py_ssize_t needed = (targetsize - extrachars) + \
7601 (targetsize << 2);
7602 extrachars += needed;
7603 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007604 if (unicode_resize(&v,
7605 PyUnicode_GET_LENGTH(v) + needed) < 0)
7606 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 Py_DECREF(x);
7608 goto onError;
7609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007611 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007612 goto onError;
7613 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7614 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 extrachars -= targetsize;
7616 }
7617 /* 1-0 mapping: skip the character */
7618 }
7619 else {
7620 /* wrong return value */
7621 PyErr_SetString(PyExc_TypeError,
7622 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007623 Py_DECREF(x);
7624 goto onError;
7625 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 Py_DECREF(x);
7627 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007629 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007630 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007631 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632 Py_XDECREF(errorHandler);
7633 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007634 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007635
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007637 Py_XDECREF(errorHandler);
7638 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007639 Py_XDECREF(v);
7640 return NULL;
7641}
7642
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007643/* Charmap encoding: the lookup table */
7644
Alexander Belopolsky40018472011-02-26 01:02:56 +00007645struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007646 PyObject_HEAD
7647 unsigned char level1[32];
7648 int count2, count3;
7649 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007650};
7651
7652static PyObject*
7653encoding_map_size(PyObject *obj, PyObject* args)
7654{
7655 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007656 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007657 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007658}
7659
7660static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007661 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 PyDoc_STR("Return the size (in bytes) of this object") },
7663 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007664};
7665
7666static void
7667encoding_map_dealloc(PyObject* o)
7668{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007669 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007670}
7671
7672static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007673 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 "EncodingMap", /*tp_name*/
7675 sizeof(struct encoding_map), /*tp_basicsize*/
7676 0, /*tp_itemsize*/
7677 /* methods */
7678 encoding_map_dealloc, /*tp_dealloc*/
7679 0, /*tp_print*/
7680 0, /*tp_getattr*/
7681 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007682 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007683 0, /*tp_repr*/
7684 0, /*tp_as_number*/
7685 0, /*tp_as_sequence*/
7686 0, /*tp_as_mapping*/
7687 0, /*tp_hash*/
7688 0, /*tp_call*/
7689 0, /*tp_str*/
7690 0, /*tp_getattro*/
7691 0, /*tp_setattro*/
7692 0, /*tp_as_buffer*/
7693 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7694 0, /*tp_doc*/
7695 0, /*tp_traverse*/
7696 0, /*tp_clear*/
7697 0, /*tp_richcompare*/
7698 0, /*tp_weaklistoffset*/
7699 0, /*tp_iter*/
7700 0, /*tp_iternext*/
7701 encoding_map_methods, /*tp_methods*/
7702 0, /*tp_members*/
7703 0, /*tp_getset*/
7704 0, /*tp_base*/
7705 0, /*tp_dict*/
7706 0, /*tp_descr_get*/
7707 0, /*tp_descr_set*/
7708 0, /*tp_dictoffset*/
7709 0, /*tp_init*/
7710 0, /*tp_alloc*/
7711 0, /*tp_new*/
7712 0, /*tp_free*/
7713 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007714};
7715
7716PyObject*
7717PyUnicode_BuildEncodingMap(PyObject* string)
7718{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 PyObject *result;
7720 struct encoding_map *mresult;
7721 int i;
7722 int need_dict = 0;
7723 unsigned char level1[32];
7724 unsigned char level2[512];
7725 unsigned char *mlevel1, *mlevel2, *mlevel3;
7726 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007727 int kind;
7728 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007729 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007730 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007731
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007732 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007733 PyErr_BadArgument();
7734 return NULL;
7735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007736 kind = PyUnicode_KIND(string);
7737 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007738 length = PyUnicode_GET_LENGTH(string);
7739 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007740 memset(level1, 0xFF, sizeof level1);
7741 memset(level2, 0xFF, sizeof level2);
7742
7743 /* If there isn't a one-to-one mapping of NULL to \0,
7744 or if there are non-BMP characters, we need to use
7745 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007746 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007748 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007749 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007750 ch = PyUnicode_READ(kind, data, i);
7751 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752 need_dict = 1;
7753 break;
7754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007755 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007756 /* unmapped character */
7757 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007758 l1 = ch >> 11;
7759 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760 if (level1[l1] == 0xFF)
7761 level1[l1] = count2++;
7762 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764 }
7765
7766 if (count2 >= 0xFF || count3 >= 0xFF)
7767 need_dict = 1;
7768
7769 if (need_dict) {
7770 PyObject *result = PyDict_New();
7771 PyObject *key, *value;
7772 if (!result)
7773 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007774 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007775 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007776 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 if (!key || !value)
7778 goto failed1;
7779 if (PyDict_SetItem(result, key, value) == -1)
7780 goto failed1;
7781 Py_DECREF(key);
7782 Py_DECREF(value);
7783 }
7784 return result;
7785 failed1:
7786 Py_XDECREF(key);
7787 Py_XDECREF(value);
7788 Py_DECREF(result);
7789 return NULL;
7790 }
7791
7792 /* Create a three-level trie */
7793 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7794 16*count2 + 128*count3 - 1);
7795 if (!result)
7796 return PyErr_NoMemory();
7797 PyObject_Init(result, &EncodingMapType);
7798 mresult = (struct encoding_map*)result;
7799 mresult->count2 = count2;
7800 mresult->count3 = count3;
7801 mlevel1 = mresult->level1;
7802 mlevel2 = mresult->level23;
7803 mlevel3 = mresult->level23 + 16*count2;
7804 memcpy(mlevel1, level1, 32);
7805 memset(mlevel2, 0xFF, 16*count2);
7806 memset(mlevel3, 0, 128*count3);
7807 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007808 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007809 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007810 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7811 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007812 /* unmapped character */
7813 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007814 o1 = ch>>11;
7815 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816 i2 = 16*mlevel1[o1] + o2;
7817 if (mlevel2[i2] == 0xFF)
7818 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007819 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820 i3 = 128*mlevel2[i2] + o3;
7821 mlevel3[i3] = i;
7822 }
7823 return result;
7824}
7825
7826static int
Victor Stinner22168992011-11-20 17:09:18 +01007827encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007828{
7829 struct encoding_map *map = (struct encoding_map*)mapping;
7830 int l1 = c>>11;
7831 int l2 = (c>>7) & 0xF;
7832 int l3 = c & 0x7F;
7833 int i;
7834
Victor Stinner22168992011-11-20 17:09:18 +01007835 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 if (c == 0)
7838 return 0;
7839 /* level 1*/
7840 i = map->level1[l1];
7841 if (i == 0xFF) {
7842 return -1;
7843 }
7844 /* level 2*/
7845 i = map->level23[16*i+l2];
7846 if (i == 0xFF) {
7847 return -1;
7848 }
7849 /* level 3 */
7850 i = map->level23[16*map->count2 + 128*i + l3];
7851 if (i == 0) {
7852 return -1;
7853 }
7854 return i;
7855}
7856
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857/* Lookup the character ch in the mapping. If the character
7858 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007859 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007860static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007861charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007862{
Christian Heimes217cfd12007-12-02 14:31:20 +00007863 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007864 PyObject *x;
7865
7866 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007867 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007868 x = PyObject_GetItem(mapping, w);
7869 Py_DECREF(w);
7870 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7872 /* No mapping found means: mapping is undefined. */
7873 PyErr_Clear();
7874 x = Py_None;
7875 Py_INCREF(x);
7876 return x;
7877 } else
7878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007880 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007881 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007882 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 long value = PyLong_AS_LONG(x);
7884 if (value < 0 || value > 255) {
7885 PyErr_SetString(PyExc_TypeError,
7886 "character mapping must be in range(256)");
7887 Py_DECREF(x);
7888 return NULL;
7889 }
7890 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007891 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007892 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007893 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007894 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 /* wrong return value */
7896 PyErr_Format(PyExc_TypeError,
7897 "character mapping must return integer, bytes or None, not %.400s",
7898 x->ob_type->tp_name);
7899 Py_DECREF(x);
7900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 }
7902}
7903
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007905charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007907 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7908 /* exponentially overallocate to minimize reallocations */
7909 if (requiredsize < 2*outsize)
7910 requiredsize = 2*outsize;
7911 if (_PyBytes_Resize(outobj, requiredsize))
7912 return -1;
7913 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914}
7915
/* Outcome of charmapencode_output() */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* lookup or buffer resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007919/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007920 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007921 space is available. Return a new reference to the object that
7922 was put in the output buffer, or Py_None, if the mapping was undefined
7923 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007924 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007925static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007926charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007927 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007928{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007929 PyObject *rep;
7930 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007931 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932
Christian Heimes90aa7642007-12-19 02:45:37 +00007933 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007935 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936 if (res == -1)
7937 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007938 if (outsize<requiredsize)
7939 if (charmapencode_resize(outobj, outpos, requiredsize))
7940 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007941 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007942 outstart[(*outpos)++] = (char)res;
7943 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007944 }
7945
7946 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007948 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007950 Py_DECREF(rep);
7951 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007952 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 if (PyLong_Check(rep)) {
7954 Py_ssize_t requiredsize = *outpos+1;
7955 if (outsize<requiredsize)
7956 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7957 Py_DECREF(rep);
7958 return enc_EXCEPTION;
7959 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007960 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007962 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007963 else {
7964 const char *repchars = PyBytes_AS_STRING(rep);
7965 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7966 Py_ssize_t requiredsize = *outpos+repsize;
7967 if (outsize<requiredsize)
7968 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7969 Py_DECREF(rep);
7970 return enc_EXCEPTION;
7971 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007972 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 memcpy(outstart + *outpos, repchars, repsize);
7974 *outpos += repsize;
7975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977 Py_DECREF(rep);
7978 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979}
7980
7981/* handle an error in PyUnicode_EncodeCharmap
7982 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007983static int
7984charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007985 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007986 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007987 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007988 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989{
7990 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007991 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007992 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007993 enum PyUnicode_Kind kind;
7994 void *data;
7995 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007996 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007997 Py_ssize_t collstartpos = *inpos;
7998 Py_ssize_t collendpos = *inpos+1;
7999 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 char *encoding = "charmap";
8001 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008003 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008004 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005
Benjamin Petersonbac79492012-01-14 13:34:47 -05008006 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008007 return -1;
8008 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008009 /* find all unencodable characters */
8010 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008012 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008013 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008014 val = encoding_map_lookup(ch, mapping);
8015 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 break;
8017 ++collendpos;
8018 continue;
8019 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008020
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008021 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8022 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 if (rep==NULL)
8024 return -1;
8025 else if (rep!=Py_None) {
8026 Py_DECREF(rep);
8027 break;
8028 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008029 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008031 }
8032 /* cache callback name lookup
8033 * (if not done yet, i.e. it's the first error) */
8034 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 if ((errors==NULL) || (!strcmp(errors, "strict")))
8036 *known_errorHandler = 1;
8037 else if (!strcmp(errors, "replace"))
8038 *known_errorHandler = 2;
8039 else if (!strcmp(errors, "ignore"))
8040 *known_errorHandler = 3;
8041 else if (!strcmp(errors, "xmlcharrefreplace"))
8042 *known_errorHandler = 4;
8043 else
8044 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008045 }
8046 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008047 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008048 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008049 return -1;
8050 case 2: /* replace */
8051 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008052 x = charmapencode_output('?', mapping, res, respos);
8053 if (x==enc_EXCEPTION) {
8054 return -1;
8055 }
8056 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008057 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008058 return -1;
8059 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008060 }
8061 /* fall through */
8062 case 3: /* ignore */
8063 *inpos = collendpos;
8064 break;
8065 case 4: /* xmlcharrefreplace */
8066 /* generate replacement (temporarily (mis)uses p) */
8067 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 char buffer[2+29+1+1];
8069 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008070 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 for (cp = buffer; *cp; ++cp) {
8072 x = charmapencode_output(*cp, mapping, res, respos);
8073 if (x==enc_EXCEPTION)
8074 return -1;
8075 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008076 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 return -1;
8078 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 }
8080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 *inpos = collendpos;
8082 break;
8083 default:
8084 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008085 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008089 if (PyBytes_Check(repunicode)) {
8090 /* Directly copy bytes result to output. */
8091 Py_ssize_t outsize = PyBytes_Size(*res);
8092 Py_ssize_t requiredsize;
8093 repsize = PyBytes_Size(repunicode);
8094 requiredsize = *respos + repsize;
8095 if (requiredsize > outsize)
8096 /* Make room for all additional bytes. */
8097 if (charmapencode_resize(res, respos, requiredsize)) {
8098 Py_DECREF(repunicode);
8099 return -1;
8100 }
8101 memcpy(PyBytes_AsString(*res) + *respos,
8102 PyBytes_AsString(repunicode), repsize);
8103 *respos += repsize;
8104 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008105 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008106 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008107 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008108 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008109 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008110 Py_DECREF(repunicode);
8111 return -1;
8112 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008113 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008114 data = PyUnicode_DATA(repunicode);
8115 kind = PyUnicode_KIND(repunicode);
8116 for (index = 0; index < repsize; index++) {
8117 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8118 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008120 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return -1;
8122 }
8123 else if (x==enc_FAILED) {
8124 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008125 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 return -1;
8127 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008128 }
8129 *inpos = newpos;
8130 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008131 }
8132 return 0;
8133}
8134
Alexander Belopolsky40018472011-02-26 01:02:56 +00008135PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136_PyUnicode_EncodeCharmap(PyObject *unicode,
8137 PyObject *mapping,
8138 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008139{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 /* output object */
8141 PyObject *res = NULL;
8142 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008143 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008144 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008145 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008146 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 PyObject *errorHandler = NULL;
8148 PyObject *exc = NULL;
8149 /* the following variable is used for caching string comparisons
8150 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8151 * 3=ignore, 4=xmlcharrefreplace */
8152 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153
Benjamin Petersonbac79492012-01-14 13:34:47 -05008154 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008155 return NULL;
8156 size = PyUnicode_GET_LENGTH(unicode);
8157
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 /* Default to Latin-1 */
8159 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008160 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 /* allocate enough for a simple encoding without
8163 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008164 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 if (res == NULL)
8166 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008167 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008169
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008171 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008173 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008174 if (x==enc_EXCEPTION) /* error */
8175 goto onError;
8176 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008177 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 &exc,
8179 &known_errorHandler, &errorHandler, errors,
8180 &res, &respos)) {
8181 goto onError;
8182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 else
8185 /* done with this character => adjust input position */
8186 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008190 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008191 if (_PyBytes_Resize(&res, respos) < 0)
8192 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008193
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 Py_XDECREF(exc);
8195 Py_XDECREF(errorHandler);
8196 return res;
8197
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008199 Py_XDECREF(res);
8200 Py_XDECREF(exc);
8201 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008202 return NULL;
8203}
8204
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008205/* Deprecated */
8206PyObject *
8207PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8208 Py_ssize_t size,
8209 PyObject *mapping,
8210 const char *errors)
8211{
8212 PyObject *result;
8213 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8214 if (unicode == NULL)
8215 return NULL;
8216 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8217 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008218 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008219}
8220
Alexander Belopolsky40018472011-02-26 01:02:56 +00008221PyObject *
8222PyUnicode_AsCharmapString(PyObject *unicode,
8223 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008224{
8225 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 PyErr_BadArgument();
8227 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008228 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008229 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008230}
8231
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008232/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233static void
8234make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236 Py_ssize_t startpos, Py_ssize_t endpos,
8237 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008238{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008240 *exceptionObject = _PyUnicodeTranslateError_Create(
8241 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242 }
8243 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8245 goto onError;
8246 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8247 goto onError;
8248 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8249 goto onError;
8250 return;
8251 onError:
8252 Py_DECREF(*exceptionObject);
8253 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008254 }
8255}
8256
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008258static void
8259raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008260 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008261 Py_ssize_t startpos, Py_ssize_t endpos,
8262 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263{
8264 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008267 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268}
8269
8270/* error handling callback helper:
8271 build arguments, call the callback and check the arguments,
8272 put the result into newpos and return the replacement string, which
8273 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008274static PyObject *
8275unicode_translate_call_errorhandler(const char *errors,
8276 PyObject **errorHandler,
8277 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008278 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008279 Py_ssize_t startpos, Py_ssize_t endpos,
8280 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008282 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008284 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285 PyObject *restuple;
8286 PyObject *resunicode;
8287
8288 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008289 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008291 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 }
8293
8294 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008295 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298
8299 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008304 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 Py_DECREF(restuple);
8306 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 }
8308 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 &resunicode, &i_newpos)) {
8310 Py_DECREF(restuple);
8311 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008313 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008315 else
8316 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008317 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8319 Py_DECREF(restuple);
8320 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 Py_INCREF(resunicode);
8323 Py_DECREF(restuple);
8324 return resunicode;
8325}
8326
8327/* Lookup the character ch in the mapping and put the result in result,
8328 which must be decrefed by the caller.
8329 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008331charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008332{
Christian Heimes217cfd12007-12-02 14:31:20 +00008333 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008334 PyObject *x;
8335
8336 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008338 x = PyObject_GetItem(mapping, w);
8339 Py_DECREF(w);
8340 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8342 /* No mapping found means: use 1:1 mapping. */
8343 PyErr_Clear();
8344 *result = NULL;
8345 return 0;
8346 } else
8347 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 }
8349 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008350 *result = x;
8351 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008353 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008354 long value = PyLong_AS_LONG(x);
8355 long max = PyUnicode_GetMax();
8356 if (value < 0 || value > max) {
8357 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008358 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008359 Py_DECREF(x);
8360 return -1;
8361 }
8362 *result = x;
8363 return 0;
8364 }
8365 else if (PyUnicode_Check(x)) {
8366 *result = x;
8367 return 0;
8368 }
8369 else {
8370 /* wrong return value */
8371 PyErr_SetString(PyExc_TypeError,
8372 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008373 Py_DECREF(x);
8374 return -1;
8375 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376}
8377/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 if not reallocate and adjust various state variables.
8379 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008380static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008384 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008385 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008386 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 /* exponentially overallocate to minimize reallocations */
8388 if (requiredsize < 2 * oldsize)
8389 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008390 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8391 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008393 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 }
8396 return 0;
8397}
8398/* lookup the character, put the result in the output string and adjust
8399 various state variables. Return a new reference to the object that
8400 was put in the output buffer in *result, or Py_None, if the mapping was
8401 undefined (in which case no character was written).
8402 The called must decref result.
8403 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008404static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008405charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8406 PyObject *mapping, Py_UCS4 **output,
8407 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008408 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008410 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8411 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416 }
8417 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008418 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008419 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008421 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 }
8423 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008424 Py_ssize_t repsize;
8425 if (PyUnicode_READY(*res) == -1)
8426 return -1;
8427 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 if (repsize==1) {
8429 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 }
8432 else if (repsize!=0) {
8433 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 Py_ssize_t requiredsize = *opos +
8435 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 Py_ssize_t i;
8438 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 for(i = 0; i < repsize; i++)
8441 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 }
8444 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 return 0;
8447}
8448
Alexander Belopolsky40018472011-02-26 01:02:56 +00008449PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450_PyUnicode_TranslateCharmap(PyObject *input,
8451 PyObject *mapping,
8452 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 /* input object */
8455 char *idata;
8456 Py_ssize_t size, i;
8457 int kind;
8458 /* output buffer */
8459 Py_UCS4 *output = NULL;
8460 Py_ssize_t osize;
8461 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008463 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464 char *reason = "character maps to <undefined>";
8465 PyObject *errorHandler = NULL;
8466 PyObject *exc = NULL;
8467 /* the following variable is used for caching string comparisons
8468 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8469 * 3=ignore, 4=xmlcharrefreplace */
8470 int known_errorHandler = -1;
8471
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 PyErr_BadArgument();
8474 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008475 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 if (PyUnicode_READY(input) == -1)
8478 return NULL;
8479 idata = (char*)PyUnicode_DATA(input);
8480 kind = PyUnicode_KIND(input);
8481 size = PyUnicode_GET_LENGTH(input);
8482 i = 0;
8483
8484 if (size == 0) {
8485 Py_INCREF(input);
8486 return input;
8487 }
8488
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 /* allocate enough for a simple 1:1 translation without
8490 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008491 osize = size;
8492 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8493 opos = 0;
8494 if (output == NULL) {
8495 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008499 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 /* try to encode it */
8501 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 if (charmaptranslate_output(input, i, mapping,
8503 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 Py_XDECREF(x);
8505 goto onError;
8506 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008507 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 else { /* untranslatable character */
8511 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8512 Py_ssize_t repsize;
8513 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 Py_ssize_t collstart = i;
8517 Py_ssize_t collend = i+1;
8518 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008519
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 while (collend < size) {
8522 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 goto onError;
8524 Py_XDECREF(x);
8525 if (x!=Py_None)
8526 break;
8527 ++collend;
8528 }
8529 /* cache callback name lookup
8530 * (if not done yet, i.e. it's the first error) */
8531 if (known_errorHandler==-1) {
8532 if ((errors==NULL) || (!strcmp(errors, "strict")))
8533 known_errorHandler = 1;
8534 else if (!strcmp(errors, "replace"))
8535 known_errorHandler = 2;
8536 else if (!strcmp(errors, "ignore"))
8537 known_errorHandler = 3;
8538 else if (!strcmp(errors, "xmlcharrefreplace"))
8539 known_errorHandler = 4;
8540 else
8541 known_errorHandler = 0;
8542 }
8543 switch (known_errorHandler) {
8544 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 raise_translate_exception(&exc, input, collstart,
8546 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008547 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 case 2: /* replace */
8549 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 for (coll = collstart; coll<collend; coll++)
8551 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 /* fall through */
8553 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 break;
8556 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 /* generate replacement (temporarily (mis)uses i) */
8558 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 char buffer[2+29+1+1];
8560 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008561 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8562 if (charmaptranslate_makespace(&output, &osize,
8563 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 goto onError;
8565 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 break;
8570 default:
8571 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 reason, input, &exc,
8573 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008574 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008576 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008577 Py_DECREF(repunicode);
8578 goto onError;
8579 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008581 repsize = PyUnicode_GET_LENGTH(repunicode);
8582 if (charmaptranslate_makespace(&output, &osize,
8583 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 Py_DECREF(repunicode);
8585 goto onError;
8586 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 for (uni2 = 0; repsize-->0; ++uni2)
8588 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8589 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008590 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008592 }
8593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8595 if (!res)
8596 goto onError;
8597 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 Py_XDECREF(exc);
8599 Py_XDECREF(errorHandler);
8600 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008601
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604 Py_XDECREF(exc);
8605 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606 return NULL;
8607}
8608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609/* Deprecated. Use PyUnicode_Translate instead. */
8610PyObject *
8611PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8612 Py_ssize_t size,
8613 PyObject *mapping,
8614 const char *errors)
8615{
Christian Heimes5f520f42012-09-11 14:03:25 +02008616 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008617 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8618 if (!unicode)
8619 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008620 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8621 Py_DECREF(unicode);
8622 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623}
8624
Alexander Belopolsky40018472011-02-26 01:02:56 +00008625PyObject *
8626PyUnicode_Translate(PyObject *str,
8627 PyObject *mapping,
8628 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629{
8630 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008631
Guido van Rossumd57fd912000-03-10 22:53:23 +00008632 str = PyUnicode_FromObject(str);
8633 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008634 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 Py_DECREF(str);
8637 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008638}
Tim Petersced69f82003-09-16 20:30:58 +00008639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008640static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008641fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008642{
8643 /* No need to call PyUnicode_READY(self) because this function is only
8644 called as a callback from fixup() which does it already. */
8645 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8646 const int kind = PyUnicode_KIND(self);
8647 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008648 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008649 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 Py_ssize_t i;
8651
8652 for (i = 0; i < len; ++i) {
8653 ch = PyUnicode_READ(kind, data, i);
8654 fixed = 0;
8655 if (ch > 127) {
8656 if (Py_UNICODE_ISSPACE(ch))
8657 fixed = ' ';
8658 else {
8659 const int decimal = Py_UNICODE_TODECIMAL(ch);
8660 if (decimal >= 0)
8661 fixed = '0' + decimal;
8662 }
8663 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008664 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008665 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 PyUnicode_WRITE(kind, data, i, fixed);
8667 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008668 else
8669 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 }
8672
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008673 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674}
8675
8676PyObject *
8677_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8678{
8679 if (!PyUnicode_Check(unicode)) {
8680 PyErr_BadInternalCall();
8681 return NULL;
8682 }
8683 if (PyUnicode_READY(unicode) == -1)
8684 return NULL;
8685 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8686 /* If the string is already ASCII, just return the same string */
8687 Py_INCREF(unicode);
8688 return unicode;
8689 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008690 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691}
8692
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008693PyObject *
8694PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8695 Py_ssize_t length)
8696{
Victor Stinnerf0124502011-11-21 23:12:56 +01008697 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008698 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008699 Py_UCS4 maxchar;
8700 enum PyUnicode_Kind kind;
8701 void *data;
8702
Victor Stinner99d7ad02012-02-22 13:37:39 +01008703 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008704 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008705 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008706 if (ch > 127) {
8707 int decimal = Py_UNICODE_TODECIMAL(ch);
8708 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008709 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008710 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008711 }
8712 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008713
8714 /* Copy to a new string */
8715 decimal = PyUnicode_New(length, maxchar);
8716 if (decimal == NULL)
8717 return decimal;
8718 kind = PyUnicode_KIND(decimal);
8719 data = PyUnicode_DATA(decimal);
8720 /* Iterate over code points */
8721 for (i = 0; i < length; i++) {
8722 Py_UNICODE ch = s[i];
8723 if (ch > 127) {
8724 int decimal = Py_UNICODE_TODECIMAL(ch);
8725 if (decimal >= 0)
8726 ch = '0' + decimal;
8727 }
8728 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008730 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008731}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008732/* --- Decimal Encoder ---------------------------------------------------- */
8733
Alexander Belopolsky40018472011-02-26 01:02:56 +00008734int
8735PyUnicode_EncodeDecimal(Py_UNICODE *s,
8736 Py_ssize_t length,
8737 char *output,
8738 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008739{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008740 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008741 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008742 enum PyUnicode_Kind kind;
8743 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008744
8745 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 PyErr_BadArgument();
8747 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008748 }
8749
Victor Stinner42bf7752011-11-21 22:52:58 +01008750 unicode = PyUnicode_FromUnicode(s, length);
8751 if (unicode == NULL)
8752 return -1;
8753
Benjamin Petersonbac79492012-01-14 13:34:47 -05008754 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008755 Py_DECREF(unicode);
8756 return -1;
8757 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008758 kind = PyUnicode_KIND(unicode);
8759 data = PyUnicode_DATA(unicode);
8760
Victor Stinnerb84d7232011-11-22 01:50:07 +01008761 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008762 PyObject *exc;
8763 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008765 Py_ssize_t startpos;
8766
8767 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008768
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008770 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008771 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008773 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 decimal = Py_UNICODE_TODECIMAL(ch);
8775 if (decimal >= 0) {
8776 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008777 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008778 continue;
8779 }
8780 if (0 < ch && ch < 256) {
8781 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008782 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008783 continue;
8784 }
Victor Stinner6345be92011-11-25 20:09:01 +01008785
Victor Stinner42bf7752011-11-21 22:52:58 +01008786 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008787 exc = NULL;
8788 raise_encode_exception(&exc, "decimal", unicode,
8789 startpos, startpos+1,
8790 "invalid decimal Unicode string");
8791 Py_XDECREF(exc);
8792 Py_DECREF(unicode);
8793 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008794 }
8795 /* 0-terminate the output string */
8796 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008797 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008798 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008799}
8800
Guido van Rossumd57fd912000-03-10 22:53:23 +00008801/* --- Helpers ------------------------------------------------------------ */
8802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008804any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 Py_ssize_t start,
8806 Py_ssize_t end)
8807{
8808 int kind1, kind2, kind;
8809 void *buf1, *buf2;
8810 Py_ssize_t len1, len2, result;
8811
8812 kind1 = PyUnicode_KIND(s1);
8813 kind2 = PyUnicode_KIND(s2);
8814 kind = kind1 > kind2 ? kind1 : kind2;
8815 buf1 = PyUnicode_DATA(s1);
8816 buf2 = PyUnicode_DATA(s2);
8817 if (kind1 != kind)
8818 buf1 = _PyUnicode_AsKind(s1, kind);
8819 if (!buf1)
8820 return -2;
8821 if (kind2 != kind)
8822 buf2 = _PyUnicode_AsKind(s2, kind);
8823 if (!buf2) {
8824 if (kind1 != kind) PyMem_Free(buf1);
8825 return -2;
8826 }
8827 len1 = PyUnicode_GET_LENGTH(s1);
8828 len2 = PyUnicode_GET_LENGTH(s2);
8829
Victor Stinner794d5672011-10-10 03:21:36 +02008830 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008831 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008832 case PyUnicode_1BYTE_KIND:
8833 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8834 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8835 else
8836 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8837 break;
8838 case PyUnicode_2BYTE_KIND:
8839 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8840 break;
8841 case PyUnicode_4BYTE_KIND:
8842 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8843 break;
8844 default:
8845 assert(0); result = -2;
8846 }
8847 }
8848 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008849 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008850 case PyUnicode_1BYTE_KIND:
8851 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8852 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8853 else
8854 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8855 break;
8856 case PyUnicode_2BYTE_KIND:
8857 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8858 break;
8859 case PyUnicode_4BYTE_KIND:
8860 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8861 break;
8862 default:
8863 assert(0); result = -2;
8864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008865 }
8866
8867 if (kind1 != kind)
8868 PyMem_Free(buf1);
8869 if (kind2 != kind)
8870 PyMem_Free(buf2);
8871
8872 return result;
8873}
8874
8875Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008876_PyUnicode_InsertThousandsGrouping(
8877 PyObject *unicode, Py_ssize_t index,
8878 Py_ssize_t n_buffer,
8879 void *digits, Py_ssize_t n_digits,
8880 Py_ssize_t min_width,
8881 const char *grouping, PyObject *thousands_sep,
8882 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883{
Victor Stinner41a863c2012-02-24 00:37:51 +01008884 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008885 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008886 Py_ssize_t thousands_sep_len;
8887 Py_ssize_t len;
8888
8889 if (unicode != NULL) {
8890 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008891 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008892 }
8893 else {
8894 kind = PyUnicode_1BYTE_KIND;
8895 data = NULL;
8896 }
8897 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8898 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8899 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8900 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008901 if (thousands_sep_kind < kind) {
8902 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8903 if (!thousands_sep_data)
8904 return -1;
8905 }
8906 else {
8907 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8908 if (!data)
8909 return -1;
8910 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008911 }
8912
Benjamin Petersonead6b532011-12-20 17:23:42 -06008913 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008915 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008916 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008917 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008918 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008919 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008920 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008921 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008922 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008923 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008924 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008925 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008926 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008927 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008928 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008929 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008930 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008931 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008932 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008933 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008934 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008935 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008936 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008937 break;
8938 default:
8939 assert(0);
8940 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008942 if (unicode != NULL && thousands_sep_kind != kind) {
8943 if (thousands_sep_kind < kind)
8944 PyMem_Free(thousands_sep_data);
8945 else
8946 PyMem_Free(data);
8947 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008948 if (unicode == NULL) {
8949 *maxchar = 127;
8950 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008951 *maxchar = MAX_MAXCHAR(*maxchar,
8952 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008953 }
8954 }
8955 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956}
8957
8958
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped to 0.  `start` is not clipped to `len`.
   `start` and `end` must be modifiable lvalues; all three arguments may be
   evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and stays correct inside an unbraced if/else.  Use it
   as a statement followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008973
Alexander Belopolsky40018472011-02-26 01:02:56 +00008974Py_ssize_t
8975PyUnicode_Count(PyObject *str,
8976 PyObject *substr,
8977 Py_ssize_t start,
8978 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008979{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008980 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008981 PyObject* str_obj;
8982 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983 int kind1, kind2, kind;
8984 void *buf1 = NULL, *buf2 = NULL;
8985 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008986
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008987 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008988 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008990 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008991 if (!sub_obj) {
8992 Py_DECREF(str_obj);
8993 return -1;
8994 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008995 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008996 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008997 Py_DECREF(str_obj);
8998 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999 }
Tim Petersced69f82003-09-16 20:30:58 +00009000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 kind1 = PyUnicode_KIND(str_obj);
9002 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009003 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009006 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009007 if (kind2 > kind) {
9008 Py_DECREF(sub_obj);
9009 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009010 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009011 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009012 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 if (!buf2)
9015 goto onError;
9016 len1 = PyUnicode_GET_LENGTH(str_obj);
9017 len2 = PyUnicode_GET_LENGTH(sub_obj);
9018
9019 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009020 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009022 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9023 result = asciilib_count(
9024 ((Py_UCS1*)buf1) + start, end - start,
9025 buf2, len2, PY_SSIZE_T_MAX
9026 );
9027 else
9028 result = ucs1lib_count(
9029 ((Py_UCS1*)buf1) + start, end - start,
9030 buf2, len2, PY_SSIZE_T_MAX
9031 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009032 break;
9033 case PyUnicode_2BYTE_KIND:
9034 result = ucs2lib_count(
9035 ((Py_UCS2*)buf1) + start, end - start,
9036 buf2, len2, PY_SSIZE_T_MAX
9037 );
9038 break;
9039 case PyUnicode_4BYTE_KIND:
9040 result = ucs4lib_count(
9041 ((Py_UCS4*)buf1) + start, end - start,
9042 buf2, len2, PY_SSIZE_T_MAX
9043 );
9044 break;
9045 default:
9046 assert(0); result = 0;
9047 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009048
9049 Py_DECREF(sub_obj);
9050 Py_DECREF(str_obj);
9051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009052 if (kind2 != kind)
9053 PyMem_Free(buf2);
9054
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 onError:
9057 Py_DECREF(sub_obj);
9058 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 if (kind2 != kind && buf2)
9060 PyMem_Free(buf2);
9061 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009062}
9063
Alexander Belopolsky40018472011-02-26 01:02:56 +00009064Py_ssize_t
9065PyUnicode_Find(PyObject *str,
9066 PyObject *sub,
9067 Py_ssize_t start,
9068 Py_ssize_t end,
9069 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009070{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009071 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009072
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009074 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009076 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009077 if (!sub) {
9078 Py_DECREF(str);
9079 return -2;
9080 }
9081 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9082 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 Py_DECREF(str);
9084 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085 }
Tim Petersced69f82003-09-16 20:30:58 +00009086
Victor Stinner794d5672011-10-10 03:21:36 +02009087 result = any_find_slice(direction,
9088 str, sub, start, end
9089 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009090
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009092 Py_DECREF(sub);
9093
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 return result;
9095}
9096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097Py_ssize_t
9098PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9099 Py_ssize_t start, Py_ssize_t end,
9100 int direction)
9101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009103 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 if (PyUnicode_READY(str) == -1)
9105 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009106 if (start < 0 || end < 0) {
9107 PyErr_SetString(PyExc_IndexError, "string index out of range");
9108 return -2;
9109 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 if (end > PyUnicode_GET_LENGTH(str))
9111 end = PyUnicode_GET_LENGTH(str);
9112 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009113 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9114 kind, end-start, ch, direction);
9115 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009116 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009117 else
9118 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119}
9120
Alexander Belopolsky40018472011-02-26 01:02:56 +00009121static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009122tailmatch(PyObject *self,
9123 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009124 Py_ssize_t start,
9125 Py_ssize_t end,
9126 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009128 int kind_self;
9129 int kind_sub;
9130 void *data_self;
9131 void *data_sub;
9132 Py_ssize_t offset;
9133 Py_ssize_t i;
9134 Py_ssize_t end_sub;
9135
9136 if (PyUnicode_READY(self) == -1 ||
9137 PyUnicode_READY(substring) == -1)
9138 return 0;
9139
9140 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 return 1;
9142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009143 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9144 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 kind_self = PyUnicode_KIND(self);
9149 data_self = PyUnicode_DATA(self);
9150 kind_sub = PyUnicode_KIND(substring);
9151 data_sub = PyUnicode_DATA(substring);
9152 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9153
9154 if (direction > 0)
9155 offset = end;
9156 else
9157 offset = start;
9158
9159 if (PyUnicode_READ(kind_self, data_self, offset) ==
9160 PyUnicode_READ(kind_sub, data_sub, 0) &&
9161 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9162 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9163 /* If both are of the same kind, memcmp is sufficient */
9164 if (kind_self == kind_sub) {
9165 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009166 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 data_sub,
9168 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009169 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009170 }
9171 /* otherwise we have to compare each character by first accesing it */
9172 else {
9173 /* We do not need to compare 0 and len(substring)-1 because
9174 the if statement above ensured already that they are equal
9175 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009176 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009177 for (i = 1; i < end_sub; ++i) {
9178 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9179 PyUnicode_READ(kind_sub, data_sub, i))
9180 return 0;
9181 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009182 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009184 }
9185
9186 return 0;
9187}
9188
Alexander Belopolsky40018472011-02-26 01:02:56 +00009189Py_ssize_t
9190PyUnicode_Tailmatch(PyObject *str,
9191 PyObject *substr,
9192 Py_ssize_t start,
9193 Py_ssize_t end,
9194 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009195{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009196 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009197
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198 str = PyUnicode_FromObject(str);
9199 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201 substr = PyUnicode_FromObject(substr);
9202 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009203 Py_DECREF(str);
9204 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205 }
Tim Petersced69f82003-09-16 20:30:58 +00009206
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009207 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009208 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209 Py_DECREF(str);
9210 Py_DECREF(substr);
9211 return result;
9212}
9213
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214/* Apply fixfct filter to the Unicode object self and return a
9215 reference to the modified object */
9216
Alexander Belopolsky40018472011-02-26 01:02:56 +00009217static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009218fixup(PyObject *self,
9219 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009221 PyObject *u;
9222 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009223 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009224
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009225 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009227 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009228 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009229
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 /* fix functions return the new maximum character in a string,
9231 if the kind of the resulting unicode object does not change,
9232 everything is fine. Otherwise we need to change the string kind
9233 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009234 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009235
9236 if (maxchar_new == 0) {
9237 /* no changes */;
9238 if (PyUnicode_CheckExact(self)) {
9239 Py_DECREF(u);
9240 Py_INCREF(self);
9241 return self;
9242 }
9243 else
9244 return u;
9245 }
9246
Victor Stinnere6abb482012-05-02 01:15:40 +02009247 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248
Victor Stinnereaab6042011-12-11 22:22:39 +01009249 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009251
9252 /* In case the maximum character changed, we need to
9253 convert the string to the new category. */
9254 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9255 if (v == NULL) {
9256 Py_DECREF(u);
9257 return NULL;
9258 }
9259 if (maxchar_new > maxchar_old) {
9260 /* If the maxchar increased so that the kind changed, not all
9261 characters are representable anymore and we need to fix the
9262 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009263 _PyUnicode_FastCopyCharacters(v, 0,
9264 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009265 maxchar_old = fixfct(v);
9266 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 }
9268 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009269 _PyUnicode_FastCopyCharacters(v, 0,
9270 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009272 Py_DECREF(u);
9273 assert(_PyUnicode_CheckConsistency(v, 1));
9274 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275}
9276
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009277static PyObject *
9278ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009280 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9281 char *resdata, *data = PyUnicode_DATA(self);
9282 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009283
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009284 res = PyUnicode_New(len, 127);
9285 if (res == NULL)
9286 return NULL;
9287 resdata = PyUnicode_DATA(res);
9288 if (lower)
9289 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009291 _Py_bytes_upper(resdata, data, len);
9292 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293}
9294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009298 Py_ssize_t j;
9299 int final_sigma;
9300 Py_UCS4 c;
9301 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009302
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009303 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9304
9305 where ! is a negation and \p{xxx} is a character with property xxx.
9306 */
9307 for (j = i - 1; j >= 0; j--) {
9308 c = PyUnicode_READ(kind, data, j);
9309 if (!_PyUnicode_IsCaseIgnorable(c))
9310 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009312 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9313 if (final_sigma) {
9314 for (j = i + 1; j < length; j++) {
9315 c = PyUnicode_READ(kind, data, j);
9316 if (!_PyUnicode_IsCaseIgnorable(c))
9317 break;
9318 }
9319 final_sigma = j == length || !_PyUnicode_IsCased(c);
9320 }
9321 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322}
9323
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009324static int
9325lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9326 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009328 /* Obscure special case. */
9329 if (c == 0x3A3) {
9330 mapped[0] = handle_capital_sigma(kind, data, length, i);
9331 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009333 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009334}
9335
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336static Py_ssize_t
9337do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009339 Py_ssize_t i, k = 0;
9340 int n_res, j;
9341 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009342
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009343 c = PyUnicode_READ(kind, data, 0);
9344 n_res = _PyUnicode_ToUpperFull(c, mapped);
9345 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009346 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009347 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009349 for (i = 1; i < length; i++) {
9350 c = PyUnicode_READ(kind, data, i);
9351 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9352 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009353 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009354 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009355 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009356 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009357 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358}
9359
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009360static Py_ssize_t
9361do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9362 Py_ssize_t i, k = 0;
9363
9364 for (i = 0; i < length; i++) {
9365 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9366 int n_res, j;
9367 if (Py_UNICODE_ISUPPER(c)) {
9368 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9369 }
9370 else if (Py_UNICODE_ISLOWER(c)) {
9371 n_res = _PyUnicode_ToUpperFull(c, mapped);
9372 }
9373 else {
9374 n_res = 1;
9375 mapped[0] = c;
9376 }
9377 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009378 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009379 res[k++] = mapped[j];
9380 }
9381 }
9382 return k;
9383}
9384
9385static Py_ssize_t
9386do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9387 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009388{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009389 Py_ssize_t i, k = 0;
9390
9391 for (i = 0; i < length; i++) {
9392 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9393 int n_res, j;
9394 if (lower)
9395 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9396 else
9397 n_res = _PyUnicode_ToUpperFull(c, mapped);
9398 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009399 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009400 res[k++] = mapped[j];
9401 }
9402 }
9403 return k;
9404}
9405
9406static Py_ssize_t
9407do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9408{
9409 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9410}
9411
9412static Py_ssize_t
9413do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9414{
9415 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9416}
9417
Benjamin Petersone51757f2012-01-12 21:10:29 -05009418static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009419do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9420{
9421 Py_ssize_t i, k = 0;
9422
9423 for (i = 0; i < length; i++) {
9424 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9425 Py_UCS4 mapped[3];
9426 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9427 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009428 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009429 res[k++] = mapped[j];
9430 }
9431 }
9432 return k;
9433}
9434
9435static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009436do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9437{
9438 Py_ssize_t i, k = 0;
9439 int previous_is_cased;
9440
9441 previous_is_cased = 0;
9442 for (i = 0; i < length; i++) {
9443 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9444 Py_UCS4 mapped[3];
9445 int n_res, j;
9446
9447 if (previous_is_cased)
9448 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9449 else
9450 n_res = _PyUnicode_ToTitleFull(c, mapped);
9451
9452 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009453 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009454 res[k++] = mapped[j];
9455 }
9456
9457 previous_is_cased = _PyUnicode_IsCased(c);
9458 }
9459 return k;
9460}
9461
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009462static PyObject *
9463case_operation(PyObject *self,
9464 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9465{
9466 PyObject *res = NULL;
9467 Py_ssize_t length, newlength = 0;
9468 int kind, outkind;
9469 void *data, *outdata;
9470 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9471
Benjamin Petersoneea48462012-01-16 14:28:50 -05009472 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473
9474 kind = PyUnicode_KIND(self);
9475 data = PyUnicode_DATA(self);
9476 length = PyUnicode_GET_LENGTH(self);
9477 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9478 if (tmp == NULL)
9479 return PyErr_NoMemory();
9480 newlength = perform(kind, data, length, tmp, &maxchar);
9481 res = PyUnicode_New(newlength, maxchar);
9482 if (res == NULL)
9483 goto leave;
9484 tmpend = tmp + newlength;
9485 outdata = PyUnicode_DATA(res);
9486 outkind = PyUnicode_KIND(res);
9487 switch (outkind) {
9488 case PyUnicode_1BYTE_KIND:
9489 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9490 break;
9491 case PyUnicode_2BYTE_KIND:
9492 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9493 break;
9494 case PyUnicode_4BYTE_KIND:
9495 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9496 break;
9497 default:
9498 assert(0);
9499 break;
9500 }
9501 leave:
9502 PyMem_FREE(tmp);
9503 return res;
9504}
9505
Tim Peters8ce9f162004-08-27 01:49:32 +00009506PyObject *
9507PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009510 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009512 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009513 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9514 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009515 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009517 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009519 int use_memcpy;
9520 unsigned char *res_data = NULL, *sep_data = NULL;
9521 PyObject *last_obj;
9522 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523
Tim Peters05eba1f2004-08-27 21:32:02 +00009524 fseq = PySequence_Fast(seq, "");
9525 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009526 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009527 }
9528
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009529 /* NOTE: the following code can't call back into Python code,
9530 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009531 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009532
Tim Peters05eba1f2004-08-27 21:32:02 +00009533 seqlen = PySequence_Fast_GET_SIZE(fseq);
9534 /* If empty sequence, return u"". */
9535 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009536 Py_DECREF(fseq);
9537 Py_INCREF(unicode_empty);
9538 res = unicode_empty;
9539 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009540 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009541
Tim Peters05eba1f2004-08-27 21:32:02 +00009542 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009543 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009544 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009545 if (seqlen == 1) {
9546 if (PyUnicode_CheckExact(items[0])) {
9547 res = items[0];
9548 Py_INCREF(res);
9549 Py_DECREF(fseq);
9550 return res;
9551 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009552 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009553 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009554 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009556 /* Set up sep and seplen */
9557 if (separator == NULL) {
9558 /* fall back to a blank space separator */
9559 sep = PyUnicode_FromOrdinal(' ');
9560 if (!sep)
9561 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009562 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009563 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009564 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009565 else {
9566 if (!PyUnicode_Check(separator)) {
9567 PyErr_Format(PyExc_TypeError,
9568 "separator: expected str instance,"
9569 " %.80s found",
9570 Py_TYPE(separator)->tp_name);
9571 goto onError;
9572 }
9573 if (PyUnicode_READY(separator))
9574 goto onError;
9575 sep = separator;
9576 seplen = PyUnicode_GET_LENGTH(separator);
9577 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9578 /* inc refcount to keep this code path symmetric with the
9579 above case of a blank separator */
9580 Py_INCREF(sep);
9581 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009582 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009583 }
9584
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009585 /* There are at least two things to join, or else we have a subclass
9586 * of str in the sequence.
9587 * Do a pre-pass to figure out the total amount of space we'll
9588 * need (sz), and see whether all argument are strings.
9589 */
9590 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009591#ifdef Py_DEBUG
9592 use_memcpy = 0;
9593#else
9594 use_memcpy = 1;
9595#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 for (i = 0; i < seqlen; i++) {
9597 const Py_ssize_t old_sz = sz;
9598 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009599 if (!PyUnicode_Check(item)) {
9600 PyErr_Format(PyExc_TypeError,
9601 "sequence item %zd: expected str instance,"
9602 " %.80s found",
9603 i, Py_TYPE(item)->tp_name);
9604 goto onError;
9605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 if (PyUnicode_READY(item) == -1)
9607 goto onError;
9608 sz += PyUnicode_GET_LENGTH(item);
9609 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009610 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009611 if (i != 0)
9612 sz += seplen;
9613 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9614 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009615 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009616 goto onError;
9617 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 if (use_memcpy && last_obj != NULL) {
9619 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9620 use_memcpy = 0;
9621 }
9622 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009623 }
Tim Petersced69f82003-09-16 20:30:58 +00009624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009626 if (res == NULL)
9627 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009628
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009629 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009630#ifdef Py_DEBUG
9631 use_memcpy = 0;
9632#else
9633 if (use_memcpy) {
9634 res_data = PyUnicode_1BYTE_DATA(res);
9635 kind = PyUnicode_KIND(res);
9636 if (seplen != 0)
9637 sep_data = PyUnicode_1BYTE_DATA(sep);
9638 }
9639#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009641 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009642 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009643 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009644 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009645 if (use_memcpy) {
9646 Py_MEMCPY(res_data,
9647 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009648 kind * seplen);
9649 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009650 }
9651 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009652 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009653 res_offset += seplen;
9654 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009655 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009656 itemlen = PyUnicode_GET_LENGTH(item);
9657 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009658 if (use_memcpy) {
9659 Py_MEMCPY(res_data,
9660 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009661 kind * itemlen);
9662 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009663 }
9664 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009665 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009666 res_offset += itemlen;
9667 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009668 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009669 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009670 if (use_memcpy)
9671 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009672 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009673 else
9674 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009675
Tim Peters05eba1f2004-08-27 21:32:02 +00009676 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009677 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009678 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680
Benjamin Peterson29060642009-01-31 22:14:21 +00009681 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009682 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009684 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685 return NULL;
9686}
9687
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width and must be one of the 1-, 2- or 4-byte kinds.  Every argument is
   parenthesized so that arbitrary expressions can be passed; note that
   `value` and `length` are evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9711
Victor Stinnerd3f08822012-05-29 12:57:52 +02009712void
9713_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9714 Py_UCS4 fill_char)
9715{
9716 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9717 const void *data = PyUnicode_DATA(unicode);
9718 assert(PyUnicode_IS_READY(unicode));
9719 assert(unicode_modifiable(unicode));
9720 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9721 assert(start >= 0);
9722 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9723 FILL(kind, data, fill_char, start, length);
9724}
9725
Victor Stinner3fe55312012-01-04 00:33:50 +01009726Py_ssize_t
9727PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9728 Py_UCS4 fill_char)
9729{
9730 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009731
9732 if (!PyUnicode_Check(unicode)) {
9733 PyErr_BadInternalCall();
9734 return -1;
9735 }
9736 if (PyUnicode_READY(unicode) == -1)
9737 return -1;
9738 if (unicode_check_modifiable(unicode))
9739 return -1;
9740
Victor Stinnerd3f08822012-05-29 12:57:52 +02009741 if (start < 0) {
9742 PyErr_SetString(PyExc_IndexError, "string index out of range");
9743 return -1;
9744 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009745 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9746 PyErr_SetString(PyExc_ValueError,
9747 "fill character is bigger than "
9748 "the string maximum character");
9749 return -1;
9750 }
9751
9752 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9753 length = Py_MIN(maxlen, length);
9754 if (length <= 0)
9755 return 0;
9756
Victor Stinnerd3f08822012-05-29 12:57:52 +02009757 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009758 return length;
9759}
9760
Victor Stinner9310abb2011-10-05 00:59:23 +02009761static PyObject *
9762pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009763 Py_ssize_t left,
9764 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 PyObject *u;
9768 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009769 int kind;
9770 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771
9772 if (left < 0)
9773 left = 0;
9774 if (right < 0)
9775 right = 0;
9776
Victor Stinnerc4b49542011-12-11 22:44:26 +01009777 if (left == 0 && right == 0)
9778 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9781 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009782 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9783 return NULL;
9784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009786 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009788 if (!u)
9789 return NULL;
9790
9791 kind = PyUnicode_KIND(u);
9792 data = PyUnicode_DATA(u);
9793 if (left)
9794 FILL(kind, data, fill, 0, left);
9795 if (right)
9796 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009797 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009798 assert(_PyUnicode_CheckConsistency(u, 1));
9799 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800}
9801
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802PyObject *
9803PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009806
9807 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009808 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009809 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009810 if (PyUnicode_READY(string) == -1) {
9811 Py_DECREF(string);
9812 return NULL;
9813 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814
Benjamin Petersonead6b532011-12-20 17:23:42 -06009815 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009817 if (PyUnicode_IS_ASCII(string))
9818 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009819 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009820 PyUnicode_GET_LENGTH(string), keepends);
9821 else
9822 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009823 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009824 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 break;
9826 case PyUnicode_2BYTE_KIND:
9827 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009828 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 PyUnicode_GET_LENGTH(string), keepends);
9830 break;
9831 case PyUnicode_4BYTE_KIND:
9832 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009833 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 PyUnicode_GET_LENGTH(string), keepends);
9835 break;
9836 default:
9837 assert(0);
9838 list = 0;
9839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840 Py_DECREF(string);
9841 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009842}
9843
Alexander Belopolsky40018472011-02-26 01:02:56 +00009844static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009845split(PyObject *self,
9846 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009847 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009848{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 int kind1, kind2, kind;
9850 void *buf1, *buf2;
9851 Py_ssize_t len1, len2;
9852 PyObject* out;
9853
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009855 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (PyUnicode_READY(self) == -1)
9858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009860 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009861 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009863 if (PyUnicode_IS_ASCII(self))
9864 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009865 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 PyUnicode_GET_LENGTH(self), maxcount
9867 );
9868 else
9869 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009870 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009871 PyUnicode_GET_LENGTH(self), maxcount
9872 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 case PyUnicode_2BYTE_KIND:
9874 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 PyUnicode_GET_LENGTH(self), maxcount
9877 );
9878 case PyUnicode_4BYTE_KIND:
9879 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 PyUnicode_GET_LENGTH(self), maxcount
9882 );
9883 default:
9884 assert(0);
9885 return NULL;
9886 }
9887
9888 if (PyUnicode_READY(substring) == -1)
9889 return NULL;
9890
9891 kind1 = PyUnicode_KIND(self);
9892 kind2 = PyUnicode_KIND(substring);
9893 kind = kind1 > kind2 ? kind1 : kind2;
9894 buf1 = PyUnicode_DATA(self);
9895 buf2 = PyUnicode_DATA(substring);
9896 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009897 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 if (!buf1)
9899 return NULL;
9900 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009901 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (!buf2) {
9903 if (kind1 != kind) PyMem_Free(buf1);
9904 return NULL;
9905 }
9906 len1 = PyUnicode_GET_LENGTH(self);
9907 len2 = PyUnicode_GET_LENGTH(substring);
9908
Benjamin Petersonead6b532011-12-20 17:23:42 -06009909 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009911 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9912 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009914 else
9915 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009916 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 break;
9918 case PyUnicode_2BYTE_KIND:
9919 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009920 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 break;
9922 case PyUnicode_4BYTE_KIND:
9923 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009924 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 break;
9926 default:
9927 out = NULL;
9928 }
9929 if (kind1 != kind)
9930 PyMem_Free(buf1);
9931 if (kind2 != kind)
9932 PyMem_Free(buf2);
9933 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934}
9935
Alexander Belopolsky40018472011-02-26 01:02:56 +00009936static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009937rsplit(PyObject *self,
9938 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009939 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009940{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 int kind1, kind2, kind;
9942 void *buf1, *buf2;
9943 Py_ssize_t len1, len2;
9944 PyObject* out;
9945
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009946 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009947 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009948
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (PyUnicode_READY(self) == -1)
9950 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009953 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009955 if (PyUnicode_IS_ASCII(self))
9956 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009957 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009958 PyUnicode_GET_LENGTH(self), maxcount
9959 );
9960 else
9961 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009962 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009963 PyUnicode_GET_LENGTH(self), maxcount
9964 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 case PyUnicode_2BYTE_KIND:
9966 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 PyUnicode_GET_LENGTH(self), maxcount
9969 );
9970 case PyUnicode_4BYTE_KIND:
9971 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009972 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 PyUnicode_GET_LENGTH(self), maxcount
9974 );
9975 default:
9976 assert(0);
9977 return NULL;
9978 }
9979
9980 if (PyUnicode_READY(substring) == -1)
9981 return NULL;
9982
9983 kind1 = PyUnicode_KIND(self);
9984 kind2 = PyUnicode_KIND(substring);
9985 kind = kind1 > kind2 ? kind1 : kind2;
9986 buf1 = PyUnicode_DATA(self);
9987 buf2 = PyUnicode_DATA(substring);
9988 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009989 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990 if (!buf1)
9991 return NULL;
9992 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009993 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 if (!buf2) {
9995 if (kind1 != kind) PyMem_Free(buf1);
9996 return NULL;
9997 }
9998 len1 = PyUnicode_GET_LENGTH(self);
9999 len2 = PyUnicode_GET_LENGTH(substring);
10000
Benjamin Petersonead6b532011-12-20 17:23:42 -060010001 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010002 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10004 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010005 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010006 else
10007 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010008 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 break;
10010 case PyUnicode_2BYTE_KIND:
10011 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 break;
10014 case PyUnicode_4BYTE_KIND:
10015 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010016 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 break;
10018 default:
10019 out = NULL;
10020 }
10021 if (kind1 != kind)
10022 PyMem_Free(buf1);
10023 if (kind2 != kind)
10024 PyMem_Free(buf2);
10025 return out;
10026}
10027
10028static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010029anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10030 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010032 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010033 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010034 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10035 return asciilib_find(buf1, len1, buf2, len2, offset);
10036 else
10037 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 case PyUnicode_2BYTE_KIND:
10039 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10040 case PyUnicode_4BYTE_KIND:
10041 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10042 }
10043 assert(0);
10044 return -1;
10045}
10046
10047static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010048anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10049 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010051 switch (kind) {
10052 case PyUnicode_1BYTE_KIND:
10053 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10054 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10055 else
10056 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10057 case PyUnicode_2BYTE_KIND:
10058 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10059 case PyUnicode_4BYTE_KIND:
10060 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10061 }
10062 assert(0);
10063 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010064}
10065
Alexander Belopolsky40018472011-02-26 01:02:56 +000010066static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010067replace(PyObject *self, PyObject *str1,
10068 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 PyObject *u;
10071 char *sbuf = PyUnicode_DATA(self);
10072 char *buf1 = PyUnicode_DATA(str1);
10073 char *buf2 = PyUnicode_DATA(str2);
10074 int srelease = 0, release1 = 0, release2 = 0;
10075 int skind = PyUnicode_KIND(self);
10076 int kind1 = PyUnicode_KIND(str1);
10077 int kind2 = PyUnicode_KIND(str2);
10078 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10079 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10080 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010081 int mayshrink;
10082 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083
10084 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010085 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010087 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088
Victor Stinner59de0ee2011-10-07 10:01:28 +020010089 if (str1 == str2)
10090 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 if (skind < kind1)
10092 /* substring too wide to be present */
10093 goto nothing;
10094
Victor Stinner49a0a212011-10-12 23:46:10 +020010095 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10096 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10097 /* Replacing str1 with str2 may cause a maxchar reduction in the
10098 result string. */
10099 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010100 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010103 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010105 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010107 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010108 Py_UCS4 u1, u2;
10109 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010110 Py_ssize_t index, pos;
10111 char *src;
10112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010114 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10115 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010116 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010119 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010121 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010123
10124 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10125 index = 0;
10126 src = sbuf;
10127 while (--maxcount)
10128 {
10129 pos++;
10130 src += pos * PyUnicode_KIND(self);
10131 slen -= pos;
10132 index += pos;
10133 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10134 if (pos < 0)
10135 break;
10136 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10137 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010138 }
10139 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 int rkind = skind;
10141 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010142 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 if (kind1 < rkind) {
10145 /* widen substring */
10146 buf1 = _PyUnicode_AsKind(str1, rkind);
10147 if (!buf1) goto error;
10148 release1 = 1;
10149 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010150 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010151 if (i < 0)
10152 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (rkind > kind2) {
10154 /* widen replacement */
10155 buf2 = _PyUnicode_AsKind(str2, rkind);
10156 if (!buf2) goto error;
10157 release2 = 1;
10158 }
10159 else if (rkind < kind2) {
10160 /* widen self and buf1 */
10161 rkind = kind2;
10162 if (release1) PyMem_Free(buf1);
10163 sbuf = _PyUnicode_AsKind(self, rkind);
10164 if (!sbuf) goto error;
10165 srelease = 1;
10166 buf1 = _PyUnicode_AsKind(str1, rkind);
10167 if (!buf1) goto error;
10168 release1 = 1;
10169 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010170 u = PyUnicode_New(slen, maxchar);
10171 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010173 assert(PyUnicode_KIND(u) == rkind);
10174 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010175
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010176 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010177 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010178 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010180 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010182
10183 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010184 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010185 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010187 if (i == -1)
10188 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010189 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010191 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010195 }
10196 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 Py_ssize_t n, i, j, ires;
10198 Py_ssize_t product, new_size;
10199 int rkind = skind;
10200 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010202 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010203 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204 buf1 = _PyUnicode_AsKind(str1, rkind);
10205 if (!buf1) goto error;
10206 release1 = 1;
10207 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010209 if (n == 0)
10210 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010212 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 buf2 = _PyUnicode_AsKind(str2, rkind);
10214 if (!buf2) goto error;
10215 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010216 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010218 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 rkind = kind2;
10220 sbuf = _PyUnicode_AsKind(self, rkind);
10221 if (!sbuf) goto error;
10222 srelease = 1;
10223 if (release1) PyMem_Free(buf1);
10224 buf1 = _PyUnicode_AsKind(str1, rkind);
10225 if (!buf1) goto error;
10226 release1 = 1;
10227 }
10228 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10229 PyUnicode_GET_LENGTH(str1))); */
10230 product = n * (len2-len1);
10231 if ((product / (len2-len1)) != n) {
10232 PyErr_SetString(PyExc_OverflowError,
10233 "replace string is too long");
10234 goto error;
10235 }
10236 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010237 if (new_size == 0) {
10238 Py_INCREF(unicode_empty);
10239 u = unicode_empty;
10240 goto done;
10241 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10243 PyErr_SetString(PyExc_OverflowError,
10244 "replace string is too long");
10245 goto error;
10246 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010247 u = PyUnicode_New(new_size, maxchar);
10248 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010250 assert(PyUnicode_KIND(u) == rkind);
10251 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 ires = i = 0;
10253 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010254 while (n-- > 0) {
10255 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010256 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010257 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010259 if (j == -1)
10260 break;
10261 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010262 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010263 memcpy(res + rkind * ires,
10264 sbuf + rkind * i,
10265 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010267 }
10268 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010270 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010272 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010278 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010279 memcpy(res + rkind * ires,
10280 sbuf + rkind * i,
10281 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010282 }
10283 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010284 /* interleave */
10285 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010286 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010288 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010290 if (--n <= 0)
10291 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010292 memcpy(res + rkind * ires,
10293 sbuf + rkind * i,
10294 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 ires++;
10296 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010297 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010298 memcpy(res + rkind * ires,
10299 sbuf + rkind * i,
10300 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010302 }
10303
10304 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010305 unicode_adjust_maxchar(&u);
10306 if (u == NULL)
10307 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010309
10310 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (srelease)
10312 PyMem_FREE(sbuf);
10313 if (release1)
10314 PyMem_FREE(buf1);
10315 if (release2)
10316 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010317 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319
Benjamin Peterson29060642009-01-31 22:14:21 +000010320 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010321 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 if (srelease)
10323 PyMem_FREE(sbuf);
10324 if (release1)
10325 PyMem_FREE(buf1);
10326 if (release2)
10327 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010328 return unicode_result_unchanged(self);
10329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 error:
10331 if (srelease && sbuf)
10332 PyMem_FREE(sbuf);
10333 if (release1 && buf1)
10334 PyMem_FREE(buf1);
10335 if (release2 && buf2)
10336 PyMem_FREE(buf2);
10337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010338}
10339
10340/* --- Unicode Object Methods --------------------------------------------- */
10341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010342PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010343 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010344\n\
10345Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010346characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347
10348static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010349unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010351 if (PyUnicode_READY(self) == -1)
10352 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010353 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010354}
10355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010356PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010357 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358\n\
10359Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010360have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010361
10362static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010363unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010364{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010365 if (PyUnicode_READY(self) == -1)
10366 return NULL;
10367 if (PyUnicode_GET_LENGTH(self) == 0)
10368 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010369 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370}
10371
Benjamin Petersond5890c82012-01-14 13:23:30 -050010372PyDoc_STRVAR(casefold__doc__,
10373 "S.casefold() -> str\n\
10374\n\
10375Return a version of S suitable for caseless comparisons.");
10376
10377static PyObject *
10378unicode_casefold(PyObject *self)
10379{
10380 if (PyUnicode_READY(self) == -1)
10381 return NULL;
10382 if (PyUnicode_IS_ASCII(self))
10383 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010384 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010385}
10386
10387
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010388/* Argument converter. Coerces to a single unicode character */
10389
10390static int
10391convert_uc(PyObject *obj, void *addr)
10392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010394 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010395
Benjamin Peterson14339b62009-01-31 16:36:08 +000010396 uniobj = PyUnicode_FromObject(obj);
10397 if (uniobj == NULL) {
10398 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010399 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010400 return 0;
10401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010403 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010404 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010405 Py_DECREF(uniobj);
10406 return 0;
10407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010409 Py_DECREF(uniobj);
10410 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010411}
10412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010413PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010416Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010417done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
10419static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010420unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010422 Py_ssize_t marg, left;
10423 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 Py_UCS4 fillchar = ' ';
10425
Victor Stinnere9a29352011-10-01 02:14:59 +020010426 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010428
Benjamin Petersonbac79492012-01-14 13:34:47 -050010429 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010430 return NULL;
10431
Victor Stinnerc4b49542011-12-11 22:44:26 +010010432 if (PyUnicode_GET_LENGTH(self) >= width)
10433 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010434
Victor Stinnerc4b49542011-12-11 22:44:26 +010010435 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 left = marg / 2 + (marg & width & 1);
10437
Victor Stinner9310abb2011-10-05 00:59:23 +020010438 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010439}
10440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441/* This function assumes that str1 and str2 are readied by the caller. */
10442
Marc-André Lemburge5034372000-08-08 08:04:29 +000010443static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010444unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 int kind1, kind2;
10447 void *data1, *data2;
Victor Stinner770e19e2012-10-04 22:59:45 +020010448 Py_ssize_t len1, len2;
10449 Py_ssize_t i, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010450
Victor Stinner90db9c42012-10-04 21:53:50 +020010451 /* a string is equal to itself */
10452 if (str1 == str2)
10453 return 0;
10454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 kind1 = PyUnicode_KIND(str1);
10456 kind2 = PyUnicode_KIND(str2);
10457 data1 = PyUnicode_DATA(str1);
10458 data2 = PyUnicode_DATA(str2);
10459 len1 = PyUnicode_GET_LENGTH(str1);
10460 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010461 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010462
Victor Stinner770e19e2012-10-04 22:59:45 +020010463 if (kind1 == 1 && kind2 == 1) {
10464 int cmp = memcmp(data1, data2, len);
10465 /* normalize result of memcmp() into the range [-1; 1] */
10466 if (cmp < 0)
10467 return -1;
10468 if (cmp > 0)
10469 return 1;
10470 }
10471 else {
10472 for (i = 0; i < len; ++i) {
10473 Py_UCS4 c1, c2;
10474 c1 = PyUnicode_READ(kind1, data1, i);
10475 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010476
Victor Stinner770e19e2012-10-04 22:59:45 +020010477 if (c1 != c2)
10478 return (c1 < c2) ? -1 : 1;
10479 }
Marc-André Lemburge5034372000-08-08 08:04:29 +000010480 }
10481
Victor Stinner770e19e2012-10-04 22:59:45 +020010482 if (len1 == len2)
10483 return 0;
10484 if (len1 < len2)
10485 return -1;
10486 else
10487 return 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010488}
10489
Alexander Belopolsky40018472011-02-26 01:02:56 +000010490int
10491PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10494 if (PyUnicode_READY(left) == -1 ||
10495 PyUnicode_READY(right) == -1)
10496 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010497 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010498 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010499 PyErr_Format(PyExc_TypeError,
10500 "Can't compare %.100s and %.100s",
10501 left->ob_type->tp_name,
10502 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503 return -1;
10504}
10505
Martin v. Löwis5b222132007-06-10 09:51:05 +000010506int
10507PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 Py_ssize_t i;
10510 int kind;
10511 void *data;
10512 Py_UCS4 chr;
10513
Victor Stinner910337b2011-10-03 03:20:16 +020010514 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (PyUnicode_READY(uni) == -1)
10516 return -1;
10517 kind = PyUnicode_KIND(uni);
10518 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010519 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10521 if (chr != str[i])
10522 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010523 /* This check keeps Python strings that end in '\0' from comparing equal
10524 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010526 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010527 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010529 return 0;
10530}
10531
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010532
Benjamin Peterson29060642009-01-31 22:14:21 +000010533#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010534 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010535
Alexander Belopolsky40018472011-02-26 01:02:56 +000010536PyObject *
10537PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010538{
10539 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010540
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010541 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10542 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (PyUnicode_READY(left) == -1 ||
10544 PyUnicode_READY(right) == -1)
10545 return NULL;
10546 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10547 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010548 if (op == Py_EQ) {
10549 Py_INCREF(Py_False);
10550 return Py_False;
10551 }
10552 if (op == Py_NE) {
10553 Py_INCREF(Py_True);
10554 return Py_True;
10555 }
10556 }
Victor Stinner90db9c42012-10-04 21:53:50 +020010557 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010558
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010559 /* Convert the return value to a Boolean */
10560 switch (op) {
10561 case Py_EQ:
10562 v = TEST_COND(result == 0);
10563 break;
10564 case Py_NE:
10565 v = TEST_COND(result != 0);
10566 break;
10567 case Py_LE:
10568 v = TEST_COND(result <= 0);
10569 break;
10570 case Py_GE:
10571 v = TEST_COND(result >= 0);
10572 break;
10573 case Py_LT:
10574 v = TEST_COND(result == -1);
10575 break;
10576 case Py_GT:
10577 v = TEST_COND(result == 1);
10578 break;
10579 default:
10580 PyErr_BadArgument();
10581 return NULL;
10582 }
10583 Py_INCREF(v);
10584 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010585 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586
Brian Curtindfc80e32011-08-10 20:28:54 -050010587 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010588}
10589
Alexander Belopolsky40018472011-02-26 01:02:56 +000010590int
10591PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010592{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 int kind1, kind2, kind;
10595 void *buf1, *buf2;
10596 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010597 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010598
10599 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010600 sub = PyUnicode_FromObject(element);
10601 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010602 PyErr_Format(PyExc_TypeError,
10603 "'in <string>' requires string as left operand, not %s",
10604 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010605 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010606 }
10607
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010609 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 Py_DECREF(sub);
10611 return -1;
10612 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010613 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10614 Py_DECREF(sub);
10615 Py_DECREF(str);
10616 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 kind1 = PyUnicode_KIND(str);
10619 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010620 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 buf1 = PyUnicode_DATA(str);
10622 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010623 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010624 if (kind2 > kind) {
10625 Py_DECREF(sub);
10626 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010627 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010628 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010629 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010630 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (!buf2) {
10632 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010633 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 return -1;
10635 }
10636 len1 = PyUnicode_GET_LENGTH(str);
10637 len2 = PyUnicode_GET_LENGTH(sub);
10638
Benjamin Petersonead6b532011-12-20 17:23:42 -060010639 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 case PyUnicode_1BYTE_KIND:
10641 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10642 break;
10643 case PyUnicode_2BYTE_KIND:
10644 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10645 break;
10646 case PyUnicode_4BYTE_KIND:
10647 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10648 break;
10649 default:
10650 result = -1;
10651 assert(0);
10652 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653
10654 Py_DECREF(str);
10655 Py_DECREF(sub);
10656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 if (kind2 != kind)
10658 PyMem_Free(buf2);
10659
Guido van Rossum403d68b2000-03-13 15:55:09 +000010660 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010661}
10662
Guido van Rossumd57fd912000-03-10 22:53:23 +000010663/* Concat to string or Unicode object giving a new Unicode object. */
10664
Alexander Belopolsky40018472011-02-26 01:02:56 +000010665PyObject *
10666PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010669 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010670 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
10672 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010675 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010677 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679
10680 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010681 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010682 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010685 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010686 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010688 }
10689
Victor Stinner488fa492011-12-12 00:01:39 +010010690 u_len = PyUnicode_GET_LENGTH(u);
10691 v_len = PyUnicode_GET_LENGTH(v);
10692 if (u_len > PY_SSIZE_T_MAX - v_len) {
10693 PyErr_SetString(PyExc_OverflowError,
10694 "strings are too large to concat");
10695 goto onError;
10696 }
10697 new_len = u_len + v_len;
10698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010700 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010701 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010702
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010704 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010706 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010707 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10708 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709 Py_DECREF(u);
10710 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010711 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
Benjamin Peterson29060642009-01-31 22:14:21 +000010714 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 Py_XDECREF(u);
10716 Py_XDECREF(v);
10717 return NULL;
10718}
10719
Walter Dörwald1ab83302007-05-18 17:15:44 +000010720void
Victor Stinner23e56682011-10-03 03:54:37 +020010721PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010722{
Victor Stinner23e56682011-10-03 03:54:37 +020010723 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010724 Py_UCS4 maxchar, maxchar2;
10725 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010726
10727 if (p_left == NULL) {
10728 if (!PyErr_Occurred())
10729 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010730 return;
10731 }
Victor Stinner23e56682011-10-03 03:54:37 +020010732 left = *p_left;
10733 if (right == NULL || !PyUnicode_Check(left)) {
10734 if (!PyErr_Occurred())
10735 PyErr_BadInternalCall();
10736 goto error;
10737 }
10738
Benjamin Petersonbac79492012-01-14 13:34:47 -050010739 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010740 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010741 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010742 goto error;
10743
Victor Stinner488fa492011-12-12 00:01:39 +010010744 /* Shortcuts */
10745 if (left == unicode_empty) {
10746 Py_DECREF(left);
10747 Py_INCREF(right);
10748 *p_left = right;
10749 return;
10750 }
10751 if (right == unicode_empty)
10752 return;
10753
10754 left_len = PyUnicode_GET_LENGTH(left);
10755 right_len = PyUnicode_GET_LENGTH(right);
10756 if (left_len > PY_SSIZE_T_MAX - right_len) {
10757 PyErr_SetString(PyExc_OverflowError,
10758 "strings are too large to concat");
10759 goto error;
10760 }
10761 new_len = left_len + right_len;
10762
10763 if (unicode_modifiable(left)
10764 && PyUnicode_CheckExact(right)
10765 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010766 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10767 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010768 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010769 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010770 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10771 {
10772 /* append inplace */
10773 if (unicode_resize(p_left, new_len) != 0) {
10774 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10775 * deallocated so it cannot be put back into
10776 * 'variable'. The MemoryError is raised when there
10777 * is no value in 'variable', which might (very
10778 * remotely) be a cause of incompatibilities.
10779 */
10780 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010781 }
Victor Stinner488fa492011-12-12 00:01:39 +010010782 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010783 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010784 }
Victor Stinner488fa492011-12-12 00:01:39 +010010785 else {
10786 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10787 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010788 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010789
Victor Stinner488fa492011-12-12 00:01:39 +010010790 /* Concat the two Unicode strings */
10791 res = PyUnicode_New(new_len, maxchar);
10792 if (res == NULL)
10793 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010794 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10795 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010796 Py_DECREF(left);
10797 *p_left = res;
10798 }
10799 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010800 return;
10801
10802error:
Victor Stinner488fa492011-12-12 00:01:39 +010010803 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010804}
10805
10806void
10807PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10808{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010809 PyUnicode_Append(pleft, right);
10810 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010811}
10812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010813PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010816Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010817string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010821unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010824 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010825 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010826 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 int kind1, kind2, kind;
10828 void *buf1, *buf2;
10829 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
Jesus Ceaac451502011-04-20 17:09:23 +020010831 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10832 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010833 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 kind1 = PyUnicode_KIND(self);
10836 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010837 if (kind2 > kind1)
10838 return PyLong_FromLong(0);
10839 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 buf1 = PyUnicode_DATA(self);
10841 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010843 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (!buf2) {
10845 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010846 return NULL;
10847 }
10848 len1 = PyUnicode_GET_LENGTH(self);
10849 len2 = PyUnicode_GET_LENGTH(substring);
10850
10851 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010852 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 case PyUnicode_1BYTE_KIND:
10854 iresult = ucs1lib_count(
10855 ((Py_UCS1*)buf1) + start, end - start,
10856 buf2, len2, PY_SSIZE_T_MAX
10857 );
10858 break;
10859 case PyUnicode_2BYTE_KIND:
10860 iresult = ucs2lib_count(
10861 ((Py_UCS2*)buf1) + start, end - start,
10862 buf2, len2, PY_SSIZE_T_MAX
10863 );
10864 break;
10865 case PyUnicode_4BYTE_KIND:
10866 iresult = ucs4lib_count(
10867 ((Py_UCS4*)buf1) + start, end - start,
10868 buf2, len2, PY_SSIZE_T_MAX
10869 );
10870 break;
10871 default:
10872 assert(0); iresult = 0;
10873 }
10874
10875 result = PyLong_FromSsize_t(iresult);
10876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 if (kind2 != kind)
10878 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
10880 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010881
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882 return result;
10883}
10884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010885PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010886 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010888Encode S using the codec registered for encoding. Default encoding\n\
10889is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010890handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010891a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10892'xmlcharrefreplace' as well as any other name registered with\n\
10893codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010894
10895static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010896unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010898 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899 char *encoding = NULL;
10900 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010901
Benjamin Peterson308d6372009-09-18 21:42:35 +000010902 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10903 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010905 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010906}
10907
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010908PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010909 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910\n\
10911Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010912If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010913
10914static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010915unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010916{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010917 Py_ssize_t i, j, line_pos, src_len, incr;
10918 Py_UCS4 ch;
10919 PyObject *u;
10920 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010922 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010923 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010924
10925 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927
Antoine Pitrou22425222011-10-04 19:10:51 +020010928 if (PyUnicode_READY(self) == -1)
10929 return NULL;
10930
Thomas Wouters7e474022000-07-16 12:04:32 +000010931 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010932 src_len = PyUnicode_GET_LENGTH(self);
10933 i = j = line_pos = 0;
10934 kind = PyUnicode_KIND(self);
10935 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010936 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010937 for (; i < src_len; i++) {
10938 ch = PyUnicode_READ(kind, src_data, i);
10939 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010940 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010941 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010942 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010944 goto overflow;
10945 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010946 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010947 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010948 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010950 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010951 goto overflow;
10952 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010954 if (ch == '\n' || ch == '\r')
10955 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010957 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010958 if (!found)
10959 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010960
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010962 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963 if (!u)
10964 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010965 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
Antoine Pitroue71d5742011-10-04 15:55:09 +020010967 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
Antoine Pitroue71d5742011-10-04 15:55:09 +020010969 for (; i < src_len; i++) {
10970 ch = PyUnicode_READ(kind, src_data, i);
10971 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010972 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010973 incr = tabsize - (line_pos % tabsize);
10974 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010975 FILL(kind, dest_data, ' ', j, incr);
10976 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010977 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010978 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010980 line_pos++;
10981 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010982 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010983 if (ch == '\n' || ch == '\r')
10984 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010986 }
10987 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010988 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010989
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010991 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993}
10994
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010995PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997\n\
10998Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010999such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000arguments start and end are interpreted as in slice notation.\n\
11001\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011002Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003
11004static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011007 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011008 Py_ssize_t start;
11009 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011010 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011
Jesus Ceaac451502011-04-20 17:09:23 +020011012 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11013 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (PyUnicode_READY(self) == -1)
11017 return NULL;
11018 if (PyUnicode_READY(substring) == -1)
11019 return NULL;
11020
Victor Stinner7931d9a2011-11-04 00:22:48 +010011021 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011022
11023 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025 if (result == -2)
11026 return NULL;
11027
Christian Heimes217cfd12007-12-02 14:31:20 +000011028 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029}
11030
11031static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011032unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011033{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011034 void *data;
11035 enum PyUnicode_Kind kind;
11036 Py_UCS4 ch;
11037 PyObject *res;
11038
11039 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11040 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011042 }
11043 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11044 PyErr_SetString(PyExc_IndexError, "string index out of range");
11045 return NULL;
11046 }
11047 kind = PyUnicode_KIND(self);
11048 data = PyUnicode_DATA(self);
11049 ch = PyUnicode_READ(kind, data, index);
11050 if (ch < 256)
11051 return get_latin1_char(ch);
11052
11053 res = PyUnicode_New(1, ch);
11054 if (res == NULL)
11055 return NULL;
11056 kind = PyUnicode_KIND(res);
11057 data = PyUnicode_DATA(res);
11058 PyUnicode_WRITE(kind, data, 0, ch);
11059 assert(_PyUnicode_CheckConsistency(res, 1));
11060 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061}
11062
Guido van Rossumc2504932007-09-18 19:42:40 +000011063/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011064 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011065static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011066unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Guido van Rossumc2504932007-09-18 19:42:40 +000011068 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011069 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011070
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011071#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011072 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011073#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (_PyUnicode_HASH(self) != -1)
11075 return _PyUnicode_HASH(self);
11076 if (PyUnicode_READY(self) == -1)
11077 return -1;
11078 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011079 /*
11080 We make the hash of the empty string be 0, rather than using
11081 (prefix ^ suffix), since this slightly obfuscates the hash secret
11082 */
11083 if (len == 0) {
11084 _PyUnicode_HASH(self) = 0;
11085 return 0;
11086 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087
11088 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011089#define HASH(P) \
11090 x ^= (Py_uhash_t) *P << 7; \
11091 while (--len >= 0) \
11092 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093
Georg Brandl2fb477c2012-02-21 00:33:36 +010011094 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 switch (PyUnicode_KIND(self)) {
11096 case PyUnicode_1BYTE_KIND: {
11097 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11098 HASH(c);
11099 break;
11100 }
11101 case PyUnicode_2BYTE_KIND: {
11102 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11103 HASH(s);
11104 break;
11105 }
11106 default: {
11107 Py_UCS4 *l;
11108 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11109 "Impossible switch case in unicode_hash");
11110 l = PyUnicode_4BYTE_DATA(self);
11111 HASH(l);
11112 break;
11113 }
11114 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011115 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11116 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117
Guido van Rossumc2504932007-09-18 19:42:40 +000011118 if (x == -1)
11119 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011120 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011121 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011126 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011128Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129
11130static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011133 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011134 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011135 Py_ssize_t start;
11136 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Jesus Ceaac451502011-04-20 17:09:23 +020011138 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11139 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 if (PyUnicode_READY(self) == -1)
11143 return NULL;
11144 if (PyUnicode_READY(substring) == -1)
11145 return NULL;
11146
Victor Stinner7931d9a2011-11-04 00:22:48 +010011147 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
11149 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 if (result == -2)
11152 return NULL;
11153
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154 if (result < 0) {
11155 PyErr_SetString(PyExc_ValueError, "substring not found");
11156 return NULL;
11157 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011158
Christian Heimes217cfd12007-12-02 14:31:20 +000011159 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011160}
11161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011162PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011163 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011165Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011166at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
11168static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011169unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 Py_ssize_t i, length;
11172 int kind;
11173 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174 int cased;
11175
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 if (PyUnicode_READY(self) == -1)
11177 return NULL;
11178 length = PyUnicode_GET_LENGTH(self);
11179 kind = PyUnicode_KIND(self);
11180 data = PyUnicode_DATA(self);
11181
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 if (length == 1)
11184 return PyBool_FromLong(
11185 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011187 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011189 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011190
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011192 for (i = 0; i < length; i++) {
11193 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011194
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11196 return PyBool_FromLong(0);
11197 else if (!cased && Py_UNICODE_ISLOWER(ch))
11198 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011200 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201}
11202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011203PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011204 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011206Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011207at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
11209static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011210unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 Py_ssize_t i, length;
11213 int kind;
11214 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215 int cased;
11216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (PyUnicode_READY(self) == -1)
11218 return NULL;
11219 length = PyUnicode_GET_LENGTH(self);
11220 kind = PyUnicode_KIND(self);
11221 data = PyUnicode_DATA(self);
11222
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 if (length == 1)
11225 return PyBool_FromLong(
11226 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011228 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011229 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011230 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011231
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011233 for (i = 0; i < length; i++) {
11234 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011235
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11237 return PyBool_FromLong(0);
11238 else if (!cased && Py_UNICODE_ISUPPER(ch))
11239 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011241 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242}
11243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011244PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011245 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011246\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011247Return True if S is a titlecased string and there is at least one\n\
11248character in S, i.e. upper- and titlecase characters may only\n\
11249follow uncased characters and lowercase characters only cased ones.\n\
11250Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251
11252static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011253unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011255 Py_ssize_t i, length;
11256 int kind;
11257 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258 int cased, previous_is_cased;
11259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (PyUnicode_READY(self) == -1)
11261 return NULL;
11262 length = PyUnicode_GET_LENGTH(self);
11263 kind = PyUnicode_KIND(self);
11264 data = PyUnicode_DATA(self);
11265
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 if (length == 1) {
11268 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11269 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11270 (Py_UNICODE_ISUPPER(ch) != 0));
11271 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011273 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011274 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011275 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011276
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 cased = 0;
11278 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011279 for (i = 0; i < length; i++) {
11280 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011281
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11283 if (previous_is_cased)
11284 return PyBool_FromLong(0);
11285 previous_is_cased = 1;
11286 cased = 1;
11287 }
11288 else if (Py_UNICODE_ISLOWER(ch)) {
11289 if (!previous_is_cased)
11290 return PyBool_FromLong(0);
11291 previous_is_cased = 1;
11292 cased = 1;
11293 }
11294 else
11295 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011296 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011297 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298}
11299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011300PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011301 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011302\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011303Return True if all characters in S are whitespace\n\
11304and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305
11306static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011307unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 Py_ssize_t i, length;
11310 int kind;
11311 void *data;
11312
11313 if (PyUnicode_READY(self) == -1)
11314 return NULL;
11315 length = PyUnicode_GET_LENGTH(self);
11316 kind = PyUnicode_KIND(self);
11317 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (length == 1)
11321 return PyBool_FromLong(
11322 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011324 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011326 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 for (i = 0; i < length; i++) {
11329 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011330 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011333 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334}
11335
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011336PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011337 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011338\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011339Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011340and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011341
11342static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011343unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 Py_ssize_t i, length;
11346 int kind;
11347 void *data;
11348
11349 if (PyUnicode_READY(self) == -1)
11350 return NULL;
11351 length = PyUnicode_GET_LENGTH(self);
11352 kind = PyUnicode_KIND(self);
11353 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011354
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011355 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (length == 1)
11357 return PyBool_FromLong(
11358 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011359
11360 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011361 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011362 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011363
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 for (i = 0; i < length; i++) {
11365 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011368 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011369}
11370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011371PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011373\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011374Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011375and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376
11377static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011378unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 int kind;
11381 void *data;
11382 Py_ssize_t len, i;
11383
11384 if (PyUnicode_READY(self) == -1)
11385 return NULL;
11386
11387 kind = PyUnicode_KIND(self);
11388 data = PyUnicode_DATA(self);
11389 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011390
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011391 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 if (len == 1) {
11393 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11394 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11395 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011396
11397 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011398 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011399 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 for (i = 0; i < len; i++) {
11402 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011403 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011406 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407}
11408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011409PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011410 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011412Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011413False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
11415static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011416unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011418 Py_ssize_t i, length;
11419 int kind;
11420 void *data;
11421
11422 if (PyUnicode_READY(self) == -1)
11423 return NULL;
11424 length = PyUnicode_GET_LENGTH(self);
11425 kind = PyUnicode_KIND(self);
11426 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 if (length == 1)
11430 return PyBool_FromLong(
11431 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011433 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011434 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011435 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 for (i = 0; i < length; i++) {
11438 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011441 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442}
11443
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011444PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011447Return True if all characters in S are digits\n\
11448and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
11450static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011451unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 Py_ssize_t i, length;
11454 int kind;
11455 void *data;
11456
11457 if (PyUnicode_READY(self) == -1)
11458 return NULL;
11459 length = PyUnicode_GET_LENGTH(self);
11460 kind = PyUnicode_KIND(self);
11461 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011462
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (length == 1) {
11465 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11466 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011469 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 for (i = 0; i < length; i++) {
11474 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011477 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011483Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
11486static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011487unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 Py_ssize_t i, length;
11490 int kind;
11491 void *data;
11492
11493 if (PyUnicode_READY(self) == -1)
11494 return NULL;
11495 length = PyUnicode_GET_LENGTH(self);
11496 kind = PyUnicode_KIND(self);
11497 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011500 if (length == 1)
11501 return PyBool_FromLong(
11502 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011504 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011506 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011508 for (i = 0; i < length; i++) {
11509 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011512 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513}
11514
Martin v. Löwis47383402007-08-15 07:32:56 +000011515int
11516PyUnicode_IsIdentifier(PyObject *self)
11517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 int kind;
11519 void *data;
11520 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011521 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 if (PyUnicode_READY(self) == -1) {
11524 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011525 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 }
11527
11528 /* Special case for empty strings */
11529 if (PyUnicode_GET_LENGTH(self) == 0)
11530 return 0;
11531 kind = PyUnicode_KIND(self);
11532 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011533
11534 /* PEP 3131 says that the first character must be in
11535 XID_Start and subsequent characters in XID_Continue,
11536 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011537 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011538 letters, digits, underscore). However, given the current
11539 definition of XID_Start and XID_Continue, it is sufficient
11540 to check just for these, except that _ must be allowed
11541 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011542 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011543 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011544 return 0;
11545
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011546 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011549 return 1;
11550}
11551
11552PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011554\n\
11555Return True if S is a valid identifier according\n\
11556to the language definition.");
11557
11558static PyObject*
11559unicode_isidentifier(PyObject *self)
11560{
11561 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11562}
11563
Georg Brandl559e5d72008-06-11 18:37:52 +000011564PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011566\n\
11567Return True if all characters in S are considered\n\
11568printable in repr() or S is empty, False otherwise.");
11569
11570static PyObject*
11571unicode_isprintable(PyObject *self)
11572{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 Py_ssize_t i, length;
11574 int kind;
11575 void *data;
11576
11577 if (PyUnicode_READY(self) == -1)
11578 return NULL;
11579 length = PyUnicode_GET_LENGTH(self);
11580 kind = PyUnicode_KIND(self);
11581 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011582
11583 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 if (length == 1)
11585 return PyBool_FromLong(
11586 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 for (i = 0; i < length; i++) {
11589 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011590 Py_RETURN_FALSE;
11591 }
11592 }
11593 Py_RETURN_TRUE;
11594}
11595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011596PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011597 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598\n\
11599Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011600iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011603unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011605 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606}
11607
Martin v. Löwis18e16552006-02-15 17:27:45 +000011608static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011609unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 if (PyUnicode_READY(self) == -1)
11612 return -1;
11613 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614}
11615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011619Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011620done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
11622static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011623unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011625 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 Py_UCS4 fillchar = ' ';
11627
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011628 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629 return NULL;
11630
Benjamin Petersonbac79492012-01-14 13:34:47 -050011631 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
Victor Stinnerc4b49542011-12-11 22:44:26 +010011634 if (PyUnicode_GET_LENGTH(self) >= width)
11635 return unicode_result_unchanged(self);
11636
11637 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638}
11639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011640PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011641 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011643Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
11645static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011646unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011648 if (PyUnicode_READY(self) == -1)
11649 return NULL;
11650 if (PyUnicode_IS_ASCII(self))
11651 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011652 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653}
11654
/* Strip modes; they double as indexes into stripformat[] below */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   3-character "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
11663
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664/* externally visible for str.strip(unicode) */
11665PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 void *data;
11669 int kind;
11670 Py_ssize_t i, j, len;
11671 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11674 return NULL;
11675
11676 kind = PyUnicode_KIND(self);
11677 data = PyUnicode_DATA(self);
11678 len = PyUnicode_GET_LENGTH(self);
11679 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11680 PyUnicode_DATA(sepobj),
11681 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011682
Benjamin Peterson14339b62009-01-31 16:36:08 +000011683 i = 0;
11684 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 while (i < len &&
11686 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 i++;
11688 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011689 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690
Benjamin Peterson14339b62009-01-31 16:36:08 +000011691 j = len;
11692 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011693 do {
11694 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 } while (j >= i &&
11696 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011697 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699
Victor Stinner7931d9a2011-11-04 00:22:48 +010011700 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701}
11702
11703PyObject*
11704PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11705{
11706 unsigned char *data;
11707 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011708 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709
Victor Stinnerde636f32011-10-01 03:55:54 +020011710 if (PyUnicode_READY(self) == -1)
11711 return NULL;
11712
Victor Stinner684d5fd2012-05-03 02:32:34 +020011713 length = PyUnicode_GET_LENGTH(self);
11714 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011715
Victor Stinner684d5fd2012-05-03 02:32:34 +020011716 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011717 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718
Victor Stinnerde636f32011-10-01 03:55:54 +020011719 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011720 PyErr_SetString(PyExc_IndexError, "string index out of range");
11721 return NULL;
11722 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011723 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011724 Py_INCREF(unicode_empty);
11725 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011726 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011727
Victor Stinner684d5fd2012-05-03 02:32:34 +020011728 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011729 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011730 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011731 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011732 }
11733 else {
11734 kind = PyUnicode_KIND(self);
11735 data = PyUnicode_1BYTE_DATA(self);
11736 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011737 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011738 length);
11739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
11742static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011743do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 int kind;
11746 void *data;
11747 Py_ssize_t len, i, j;
11748
11749 if (PyUnicode_READY(self) == -1)
11750 return NULL;
11751
11752 kind = PyUnicode_KIND(self);
11753 data = PyUnicode_DATA(self);
11754 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755
Benjamin Peterson14339b62009-01-31 16:36:08 +000011756 i = 0;
11757 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011759 i++;
11760 }
11761 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762
Benjamin Peterson14339b62009-01-31 16:36:08 +000011763 j = len;
11764 if (striptype != LEFTSTRIP) {
11765 do {
11766 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 j++;
11769 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770
Victor Stinner7931d9a2011-11-04 00:22:48 +010011771 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772}
11773
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774
11775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011776do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011777{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011778 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779
Benjamin Peterson14339b62009-01-31 16:36:08 +000011780 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11781 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782
Benjamin Peterson14339b62009-01-31 16:36:08 +000011783 if (sep != NULL && sep != Py_None) {
11784 if (PyUnicode_Check(sep))
11785 return _PyUnicode_XStrip(self, striptype, sep);
11786 else {
11787 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011788 "%s arg must be None or str",
11789 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011790 return NULL;
11791 }
11792 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011793
Benjamin Peterson14339b62009-01-31 16:36:08 +000011794 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795}
11796
11797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011798PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011799 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011800\n\
11801Return a copy of the string S with leading and trailing\n\
11802whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011803If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011804
11805static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011806unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011808 if (PyTuple_GET_SIZE(args) == 0)
11809 return do_strip(self, BOTHSTRIP); /* Common case */
11810 else
11811 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812}
11813
11814
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011815PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011816 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011817\n\
11818Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011819If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011820
11821static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011822unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011823{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011824 if (PyTuple_GET_SIZE(args) == 0)
11825 return do_strip(self, LEFTSTRIP); /* Common case */
11826 else
11827 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828}
11829
11830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011831PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833\n\
11834Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011835If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836
11837static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011838unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011839{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011840 if (PyTuple_GET_SIZE(args) == 0)
11841 return do_strip(self, RIGHTSTRIP); /* Common case */
11842 else
11843 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844}
11845
11846
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011848unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011850 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Georg Brandl222de0f2009-04-12 12:01:50 +000011853 if (len < 1) {
11854 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011855 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Victor Stinnerc4b49542011-12-11 22:44:26 +010011858 /* no repeat, return original string */
11859 if (len == 1)
11860 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011861
Benjamin Petersonbac79492012-01-14 13:34:47 -050011862 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011863 return NULL;
11864
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011865 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011866 PyErr_SetString(PyExc_OverflowError,
11867 "repeated string is too long");
11868 return NULL;
11869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011871
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011872 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 if (!u)
11874 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011875 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 if (PyUnicode_GET_LENGTH(str) == 1) {
11878 const int kind = PyUnicode_KIND(str);
11879 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011880 if (kind == PyUnicode_1BYTE_KIND) {
11881 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011882 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011883 }
11884 else if (kind == PyUnicode_2BYTE_KIND) {
11885 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011886 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011887 ucs2[n] = fill_char;
11888 } else {
11889 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11890 assert(kind == PyUnicode_4BYTE_KIND);
11891 for (n = 0; n < len; ++n)
11892 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 }
11895 else {
11896 /* number of characters copied this far */
11897 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011898 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 char *to = (char *) PyUnicode_DATA(u);
11900 Py_MEMCPY(to, PyUnicode_DATA(str),
11901 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 n = (done <= nchars-done) ? done : nchars-done;
11904 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011905 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 }
11908
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011909 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011910 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911}
11912
Alexander Belopolsky40018472011-02-26 01:02:56 +000011913PyObject *
11914PyUnicode_Replace(PyObject *obj,
11915 PyObject *subobj,
11916 PyObject *replobj,
11917 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918{
11919 PyObject *self;
11920 PyObject *str1;
11921 PyObject *str2;
11922 PyObject *result;
11923
11924 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011925 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011928 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 Py_DECREF(self);
11930 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 }
11932 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011933 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 Py_DECREF(self);
11935 Py_DECREF(str1);
11936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011938 if (PyUnicode_READY(self) == -1 ||
11939 PyUnicode_READY(str1) == -1 ||
11940 PyUnicode_READY(str2) == -1)
11941 result = NULL;
11942 else
11943 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 Py_DECREF(self);
11945 Py_DECREF(str1);
11946 Py_DECREF(str2);
11947 return result;
11948}
11949
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011950PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011951 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011952\n\
11953Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011954old replaced by new. If the optional argument count is\n\
11955given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
11957static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011960 PyObject *str1;
11961 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011962 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963 PyObject *result;
11964
Martin v. Löwis18e16552006-02-15 17:27:45 +000011965 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011967 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011970 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 return NULL;
11972 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011973 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 Py_DECREF(str1);
11975 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011976 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011977 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11978 result = NULL;
11979 else
11980 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982 Py_DECREF(str1);
11983 Py_DECREF(str2);
11984 return result;
11985}
11986
Alexander Belopolsky40018472011-02-26 01:02:56 +000011987static PyObject *
11988unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011990 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 Py_ssize_t isize;
11992 Py_ssize_t osize, squote, dquote, i, o;
11993 Py_UCS4 max, quote;
11994 int ikind, okind;
11995 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998 return NULL;
11999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 isize = PyUnicode_GET_LENGTH(unicode);
12001 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 /* Compute length of output, quote characters, and
12004 maximum character */
12005 osize = 2; /* quotes */
12006 max = 127;
12007 squote = dquote = 0;
12008 ikind = PyUnicode_KIND(unicode);
12009 for (i = 0; i < isize; i++) {
12010 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12011 switch (ch) {
12012 case '\'': squote++; osize++; break;
12013 case '"': dquote++; osize++; break;
12014 case '\\': case '\t': case '\r': case '\n':
12015 osize += 2; break;
12016 default:
12017 /* Fast-path ASCII */
12018 if (ch < ' ' || ch == 0x7f)
12019 osize += 4; /* \xHH */
12020 else if (ch < 0x7f)
12021 osize++;
12022 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12023 osize++;
12024 max = ch > max ? ch : max;
12025 }
12026 else if (ch < 0x100)
12027 osize += 4; /* \xHH */
12028 else if (ch < 0x10000)
12029 osize += 6; /* \uHHHH */
12030 else
12031 osize += 10; /* \uHHHHHHHH */
12032 }
12033 }
12034
12035 quote = '\'';
12036 if (squote) {
12037 if (dquote)
12038 /* Both squote and dquote present. Use squote,
12039 and escape them */
12040 osize += squote;
12041 else
12042 quote = '"';
12043 }
12044
12045 repr = PyUnicode_New(osize, max);
12046 if (repr == NULL)
12047 return NULL;
12048 okind = PyUnicode_KIND(repr);
12049 odata = PyUnicode_DATA(repr);
12050
12051 PyUnicode_WRITE(okind, odata, 0, quote);
12052 PyUnicode_WRITE(okind, odata, osize-1, quote);
12053
12054 for (i = 0, o = 1; i < isize; i++) {
12055 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012056
12057 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 if ((ch == quote) || (ch == '\\')) {
12059 PyUnicode_WRITE(okind, odata, o++, '\\');
12060 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012061 continue;
12062 }
12063
Benjamin Peterson29060642009-01-31 22:14:21 +000012064 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 PyUnicode_WRITE(okind, odata, o++, '\\');
12067 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012068 }
12069 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 PyUnicode_WRITE(okind, odata, o++, '\\');
12071 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012072 }
12073 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 PyUnicode_WRITE(okind, odata, o++, '\\');
12075 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012076 }
12077
12078 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012079 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 PyUnicode_WRITE(okind, odata, o++, '\\');
12081 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012082 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12083 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012084 }
12085
Georg Brandl559e5d72008-06-11 18:37:52 +000012086 /* Copy ASCII characters as-is */
12087 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012089 }
12090
Benjamin Peterson29060642009-01-31 22:14:21 +000012091 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012092 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012093 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012094 (categories Z* and C* except ASCII space)
12095 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012097 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012098 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012101 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012104 /* Map 16-bit characters to '\uxxxx' */
12105 else if (ch <= 0xffff) {
12106 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012107 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12108 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12109 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12110 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012111 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012112 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012113 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012114 PyUnicode_WRITE(okind, odata, o++, 'U');
12115 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12116 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12117 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12118 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012119 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12120 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012123 }
12124 }
12125 /* Copy characters as-is */
12126 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012128 }
12129 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012130 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012132 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012133 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134}
12135
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012136PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012137 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012138\n\
12139Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012140such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141arguments start and end are interpreted as in slice notation.\n\
12142\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012143Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144
12145static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012148 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012149 Py_ssize_t start;
12150 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012151 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
Jesus Ceaac451502011-04-20 17:09:23 +020012153 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12154 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012155 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 if (PyUnicode_READY(self) == -1)
12158 return NULL;
12159 if (PyUnicode_READY(substring) == -1)
12160 return NULL;
12161
Victor Stinner7931d9a2011-11-04 00:22:48 +010012162 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
12164 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012166 if (result == -2)
12167 return NULL;
12168
Christian Heimes217cfd12007-12-02 14:31:20 +000012169 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170}
12171
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012172PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012175Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176
12177static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012180 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012181 Py_ssize_t start;
12182 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012183 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
Jesus Ceaac451502011-04-20 17:09:23 +020012185 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12186 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012187 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (PyUnicode_READY(self) == -1)
12190 return NULL;
12191 if (PyUnicode_READY(substring) == -1)
12192 return NULL;
12193
Victor Stinner7931d9a2011-11-04 00:22:48 +010012194 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
12196 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 if (result == -2)
12199 return NULL;
12200
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 if (result < 0) {
12202 PyErr_SetString(PyExc_ValueError, "substring not found");
12203 return NULL;
12204 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205
Christian Heimes217cfd12007-12-02 14:31:20 +000012206 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207}
12208
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012209PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012210 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012212Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012213done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214
12215static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012216unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012218 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 Py_UCS4 fillchar = ' ';
12220
Victor Stinnere9a29352011-10-01 02:14:59 +020012221 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012223
Benjamin Petersonbac79492012-01-14 13:34:47 -050012224 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225 return NULL;
12226
Victor Stinnerc4b49542011-12-11 22:44:26 +010012227 if (PyUnicode_GET_LENGTH(self) >= width)
12228 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Victor Stinnerc4b49542011-12-11 22:44:26 +010012230 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231}
12232
Alexander Belopolsky40018472011-02-26 01:02:56 +000012233PyObject *
12234PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235{
12236 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012237
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238 s = PyUnicode_FromObject(s);
12239 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012240 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012241 if (sep != NULL) {
12242 sep = PyUnicode_FromObject(sep);
12243 if (sep == NULL) {
12244 Py_DECREF(s);
12245 return NULL;
12246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247 }
12248
Victor Stinner9310abb2011-10-05 00:59:23 +020012249 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250
12251 Py_DECREF(s);
12252 Py_XDECREF(sep);
12253 return result;
12254}
12255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012256PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012257 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258\n\
12259Return a list of the words in S, using sep as the\n\
12260delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012261splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012262whitespace string is a separator and empty strings are\n\
12263removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264
12265static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012266unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012268 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012270 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012272 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12273 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 return NULL;
12275
12276 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012279 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012281 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282}
12283
Thomas Wouters477c8d52006-05-27 19:21:47 +000012284PyObject *
12285PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12286{
12287 PyObject* str_obj;
12288 PyObject* sep_obj;
12289 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 int kind1, kind2, kind;
12291 void *buf1 = NULL, *buf2 = NULL;
12292 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012293
12294 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012295 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012296 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012297 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012298 if (!sep_obj) {
12299 Py_DECREF(str_obj);
12300 return NULL;
12301 }
12302 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12303 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012304 Py_DECREF(str_obj);
12305 return NULL;
12306 }
12307
Victor Stinner14f8f022011-10-05 20:58:25 +020012308 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012310 kind = Py_MAX(kind1, kind2);
12311 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012313 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012314 if (!buf1)
12315 goto onError;
12316 buf2 = PyUnicode_DATA(sep_obj);
12317 if (kind2 != kind)
12318 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12319 if (!buf2)
12320 goto onError;
12321 len1 = PyUnicode_GET_LENGTH(str_obj);
12322 len2 = PyUnicode_GET_LENGTH(sep_obj);
12323
Benjamin Petersonead6b532011-12-20 17:23:42 -060012324 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012326 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12327 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12328 else
12329 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 break;
12331 case PyUnicode_2BYTE_KIND:
12332 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12333 break;
12334 case PyUnicode_4BYTE_KIND:
12335 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12336 break;
12337 default:
12338 assert(0);
12339 out = 0;
12340 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012341
12342 Py_DECREF(sep_obj);
12343 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 if (kind1 != kind)
12345 PyMem_Free(buf1);
12346 if (kind2 != kind)
12347 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012348
12349 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 onError:
12351 Py_DECREF(sep_obj);
12352 Py_DECREF(str_obj);
12353 if (kind1 != kind && buf1)
12354 PyMem_Free(buf1);
12355 if (kind2 != kind && buf2)
12356 PyMem_Free(buf2);
12357 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012358}
12359
12360
12361PyObject *
12362PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12363{
12364 PyObject* str_obj;
12365 PyObject* sep_obj;
12366 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 int kind1, kind2, kind;
12368 void *buf1 = NULL, *buf2 = NULL;
12369 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012370
12371 str_obj = PyUnicode_FromObject(str_in);
12372 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012374 sep_obj = PyUnicode_FromObject(sep_in);
12375 if (!sep_obj) {
12376 Py_DECREF(str_obj);
12377 return NULL;
12378 }
12379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 kind1 = PyUnicode_KIND(str_in);
12381 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012382 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 buf1 = PyUnicode_DATA(str_in);
12384 if (kind1 != kind)
12385 buf1 = _PyUnicode_AsKind(str_in, kind);
12386 if (!buf1)
12387 goto onError;
12388 buf2 = PyUnicode_DATA(sep_obj);
12389 if (kind2 != kind)
12390 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12391 if (!buf2)
12392 goto onError;
12393 len1 = PyUnicode_GET_LENGTH(str_obj);
12394 len2 = PyUnicode_GET_LENGTH(sep_obj);
12395
Benjamin Petersonead6b532011-12-20 17:23:42 -060012396 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012398 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12399 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12400 else
12401 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 break;
12403 case PyUnicode_2BYTE_KIND:
12404 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12405 break;
12406 case PyUnicode_4BYTE_KIND:
12407 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12408 break;
12409 default:
12410 assert(0);
12411 out = 0;
12412 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012413
12414 Py_DECREF(sep_obj);
12415 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 if (kind1 != kind)
12417 PyMem_Free(buf1);
12418 if (kind2 != kind)
12419 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012420
12421 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012422 onError:
12423 Py_DECREF(sep_obj);
12424 Py_DECREF(str_obj);
12425 if (kind1 != kind && buf1)
12426 PyMem_Free(buf1);
12427 if (kind2 != kind && buf2)
12428 PyMem_Free(buf2);
12429 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012430}
12431
12432PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012433 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012435Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012436the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012437found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438
12439static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012440unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441{
Victor Stinner9310abb2011-10-05 00:59:23 +020012442 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012443}
12444
12445PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012446 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012447\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012448Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012449the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012450separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451
12452static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012453unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012454{
Victor Stinner9310abb2011-10-05 00:59:23 +020012455 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456}
12457
Alexander Belopolsky40018472011-02-26 01:02:56 +000012458PyObject *
12459PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012460{
12461 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012463 s = PyUnicode_FromObject(s);
12464 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012465 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012466 if (sep != NULL) {
12467 sep = PyUnicode_FromObject(sep);
12468 if (sep == NULL) {
12469 Py_DECREF(s);
12470 return NULL;
12471 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012472 }
12473
Victor Stinner9310abb2011-10-05 00:59:23 +020012474 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012475
12476 Py_DECREF(s);
12477 Py_XDECREF(sep);
12478 return result;
12479}
12480
12481PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012482 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012483\n\
12484Return a list of the words in S, using sep as the\n\
12485delimiter string, starting at the end of the string and\n\
12486working to the front. If maxsplit is given, at most maxsplit\n\
12487splits are done. If sep is not specified, any whitespace string\n\
12488is a separator.");
12489
12490static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012491unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012492{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012493 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012495 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012496
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012497 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12498 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012499 return NULL;
12500
12501 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012502 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012503 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012504 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012506 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012507}
12508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012509PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012510 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511\n\
12512Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012513Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012514is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515
12516static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012519 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012520 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012522 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12523 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524 return NULL;
12525
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012526 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
12529static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012530PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012532 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533}
12534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012535PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012536 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537\n\
12538Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012539and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
12541static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012542unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012544 if (PyUnicode_READY(self) == -1)
12545 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012546 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547}
12548
Georg Brandlceee0772007-11-27 23:48:05 +000012549PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012550 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012551\n\
12552Return a translation table usable for str.translate().\n\
12553If there is only one argument, it must be a dictionary mapping Unicode\n\
12554ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012555Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012556If there are two arguments, they must be strings of equal length, and\n\
12557in the resulting dictionary, each character in x will be mapped to the\n\
12558character at the same position in y. If there is a third argument, it\n\
12559must be a string, whose characters will be mapped to None in the result.");
12560
12561static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012562unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012563{
12564 PyObject *x, *y = NULL, *z = NULL;
12565 PyObject *new = NULL, *key, *value;
12566 Py_ssize_t i = 0;
12567 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012568
Georg Brandlceee0772007-11-27 23:48:05 +000012569 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12570 return NULL;
12571 new = PyDict_New();
12572 if (!new)
12573 return NULL;
12574 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 int x_kind, y_kind, z_kind;
12576 void *x_data, *y_data, *z_data;
12577
Georg Brandlceee0772007-11-27 23:48:05 +000012578 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012579 if (!PyUnicode_Check(x)) {
12580 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12581 "be a string if there is a second argument");
12582 goto err;
12583 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012585 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12586 "arguments must have equal length");
12587 goto err;
12588 }
12589 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 x_kind = PyUnicode_KIND(x);
12591 y_kind = PyUnicode_KIND(y);
12592 x_data = PyUnicode_DATA(x);
12593 y_data = PyUnicode_DATA(y);
12594 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12595 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012596 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012597 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012598 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012599 if (!value) {
12600 Py_DECREF(key);
12601 goto err;
12602 }
Georg Brandlceee0772007-11-27 23:48:05 +000012603 res = PyDict_SetItem(new, key, value);
12604 Py_DECREF(key);
12605 Py_DECREF(value);
12606 if (res < 0)
12607 goto err;
12608 }
12609 /* create entries for deleting chars in z */
12610 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 z_kind = PyUnicode_KIND(z);
12612 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012613 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012615 if (!key)
12616 goto err;
12617 res = PyDict_SetItem(new, key, Py_None);
12618 Py_DECREF(key);
12619 if (res < 0)
12620 goto err;
12621 }
12622 }
12623 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624 int kind;
12625 void *data;
12626
Georg Brandlceee0772007-11-27 23:48:05 +000012627 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012628 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012629 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12630 "to maketrans it must be a dict");
12631 goto err;
12632 }
12633 /* copy entries into the new dict, converting string keys to int keys */
12634 while (PyDict_Next(x, &i, &key, &value)) {
12635 if (PyUnicode_Check(key)) {
12636 /* convert string keys to integer keys */
12637 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012638 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012639 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12640 "table must be of length 1");
12641 goto err;
12642 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 kind = PyUnicode_KIND(key);
12644 data = PyUnicode_DATA(key);
12645 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012646 if (!newkey)
12647 goto err;
12648 res = PyDict_SetItem(new, newkey, value);
12649 Py_DECREF(newkey);
12650 if (res < 0)
12651 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012652 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012653 /* just keep integer keys */
12654 if (PyDict_SetItem(new, key, value) < 0)
12655 goto err;
12656 } else {
12657 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12658 "be strings or integers");
12659 goto err;
12660 }
12661 }
12662 }
12663 return new;
12664 err:
12665 Py_DECREF(new);
12666 return NULL;
12667}
12668
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012669PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671\n\
12672Return a copy of the string S, where all characters have been mapped\n\
12673through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012674Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012675Unmapped characters are left untouched. Characters mapped to None\n\
12676are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677
12678static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682}
12683
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012684PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012687Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688
12689static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012690unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012692 if (PyUnicode_READY(self) == -1)
12693 return NULL;
12694 if (PyUnicode_IS_ASCII(self))
12695 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012696 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697}
12698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012699PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012700 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012702Pad a numeric string S with zeros on the left, to fill a field\n\
12703of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704
12705static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012706unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012708 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012709 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012710 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 int kind;
12712 void *data;
12713 Py_UCS4 chr;
12714
Martin v. Löwis18e16552006-02-15 17:27:45 +000012715 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716 return NULL;
12717
Benjamin Petersonbac79492012-01-14 13:34:47 -050012718 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012719 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720
Victor Stinnerc4b49542011-12-11 22:44:26 +010012721 if (PyUnicode_GET_LENGTH(self) >= width)
12722 return unicode_result_unchanged(self);
12723
12724 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725
12726 u = pad(self, fill, 0, '0');
12727
Walter Dörwald068325e2002-04-15 13:36:47 +000012728 if (u == NULL)
12729 return NULL;
12730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 kind = PyUnicode_KIND(u);
12732 data = PyUnicode_DATA(u);
12733 chr = PyUnicode_READ(kind, data, fill);
12734
12735 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012737 PyUnicode_WRITE(kind, data, 0, chr);
12738 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739 }
12740
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012741 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012742 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
#if 0
/* Compiled out.  Wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012753PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012755\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012756Return True if S starts with the specified prefix, False otherwise.\n\
12757With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012758With optional end, stop comparing S at that position.\n\
12759prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760
12761static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012762unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012765 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012766 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012767 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012768 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012769 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012770
Jesus Ceaac451502011-04-20 17:09:23 +020012771 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012773 if (PyTuple_Check(subobj)) {
12774 Py_ssize_t i;
12775 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012776 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 if (substring == NULL)
12778 return NULL;
12779 result = tailmatch(self, substring, start, end, -1);
12780 Py_DECREF(substring);
12781 if (result) {
12782 Py_RETURN_TRUE;
12783 }
12784 }
12785 /* nothing matched */
12786 Py_RETURN_FALSE;
12787 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012788 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012789 if (substring == NULL) {
12790 if (PyErr_ExceptionMatches(PyExc_TypeError))
12791 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12792 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012794 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012795 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798}
12799
12800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012801PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012802 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012804Return True if S ends with the specified suffix, False otherwise.\n\
12805With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012806With optional end, stop comparing S at that position.\n\
12807suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
12809static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012810unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012813 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012814 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012815 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012816 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012817 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818
Jesus Ceaac451502011-04-20 17:09:23 +020012819 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012821 if (PyTuple_Check(subobj)) {
12822 Py_ssize_t i;
12823 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012824 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012825 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012826 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012828 result = tailmatch(self, substring, start, end, +1);
12829 Py_DECREF(substring);
12830 if (result) {
12831 Py_RETURN_TRUE;
12832 }
12833 }
12834 Py_RETURN_FALSE;
12835 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012836 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012837 if (substring == NULL) {
12838 if (PyErr_ExceptionMatches(PyExc_TypeError))
12839 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12840 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012842 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012843 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012844 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012845 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012846}
12847
Victor Stinner202fdca2012-05-07 12:47:02 +020012848Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012849_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012850{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012852 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12853 writer->data = PyUnicode_DATA(writer->buffer);
12854 writer->kind = PyUnicode_KIND(writer->buffer);
12855}
12856
Victor Stinnerd3f08822012-05-29 12:57:52 +020012857void
12858_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012859{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012860 memset(writer, 0, sizeof(*writer));
12861#ifdef Py_DEBUG
12862 writer->kind = 5; /* invalid kind */
12863#endif
12864 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012865 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012866}
12867
Victor Stinnerd3f08822012-05-29 12:57:52 +020012868int
12869_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12870 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012871{
12872 Py_ssize_t newlen;
12873 PyObject *newbuffer;
12874
Victor Stinnerd3f08822012-05-29 12:57:52 +020012875 assert(length > 0);
12876
Victor Stinner202fdca2012-05-07 12:47:02 +020012877 if (length > PY_SSIZE_T_MAX - writer->pos) {
12878 PyErr_NoMemory();
12879 return -1;
12880 }
12881 newlen = writer->pos + length;
12882
Victor Stinnerd3f08822012-05-29 12:57:52 +020012883 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012884 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012885 /* overallocate 25% to limit the number of resize */
12886 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12887 newlen += newlen / 4;
12888 if (newlen < writer->min_length)
12889 newlen = writer->min_length;
12890 }
12891 writer->buffer = PyUnicode_New(newlen, maxchar);
12892 if (writer->buffer == NULL)
12893 return -1;
12894 _PyUnicodeWriter_Update(writer);
12895 return 0;
12896 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012897
Victor Stinnerd3f08822012-05-29 12:57:52 +020012898 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012899 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012900 /* overallocate 25% to limit the number of resize */
12901 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12902 newlen += newlen / 4;
12903 if (newlen < writer->min_length)
12904 newlen = writer->min_length;
12905 }
12906
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012907 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012908 /* resize + widen */
12909 newbuffer = PyUnicode_New(newlen, maxchar);
12910 if (newbuffer == NULL)
12911 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012912 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12913 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012914 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012915 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012916 }
12917 else {
12918 newbuffer = resize_compact(writer->buffer, newlen);
12919 if (newbuffer == NULL)
12920 return -1;
12921 }
12922 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012923 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012924 }
12925 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012926 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012927 newbuffer = PyUnicode_New(writer->size, maxchar);
12928 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012929 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012930 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12931 writer->buffer, 0, writer->pos);
12932 Py_DECREF(writer->buffer);
12933 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012934 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012935 }
12936 return 0;
12937}
12938
Victor Stinnerd3f08822012-05-29 12:57:52 +020012939int
12940_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12941{
12942 Py_UCS4 maxchar;
12943 Py_ssize_t len;
12944
12945 if (PyUnicode_READY(str) == -1)
12946 return -1;
12947 len = PyUnicode_GET_LENGTH(str);
12948 if (len == 0)
12949 return 0;
12950 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12951 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012952 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012953 Py_INCREF(str);
12954 writer->buffer = str;
12955 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012956 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012957 writer->size = 0;
12958 writer->pos += len;
12959 return 0;
12960 }
12961 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12962 return -1;
12963 }
12964 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12965 str, 0, len);
12966 writer->pos += len;
12967 return 0;
12968}
12969
12970PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012971_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012972{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012973 if (writer->pos == 0) {
12974 Py_XDECREF(writer->buffer);
12975 Py_INCREF(unicode_empty);
12976 return unicode_empty;
12977 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012978 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012979 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12980 return writer->buffer;
12981 }
12982 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12983 PyObject *newbuffer;
12984 newbuffer = resize_compact(writer->buffer, writer->pos);
12985 if (newbuffer == NULL) {
12986 Py_DECREF(writer->buffer);
12987 return NULL;
12988 }
12989 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012990 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012991 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012992 return writer->buffer;
12993}
12994
Victor Stinnerd3f08822012-05-29 12:57:52 +020012995void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012996_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012997{
12998 Py_CLEAR(writer->buffer);
12999}
13000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013002
13003PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013004 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013005\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013006Return a formatted version of S, using substitutions from args and kwargs.\n\
13007The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013008
Eric Smith27bbca62010-11-04 17:06:58 +000013009PyDoc_STRVAR(format_map__doc__,
13010 "S.format_map(mapping) -> str\n\
13011\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013012Return a formatted version of S, using substitutions from mapping.\n\
13013The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013014
Eric Smith4a7d76d2008-05-30 18:10:19 +000013015static PyObject *
13016unicode__format__(PyObject* self, PyObject* args)
13017{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013018 PyObject *format_spec;
13019 _PyUnicodeWriter writer;
13020 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013021
13022 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13023 return NULL;
13024
Victor Stinnerd3f08822012-05-29 12:57:52 +020013025 if (PyUnicode_READY(self) == -1)
13026 return NULL;
13027 _PyUnicodeWriter_Init(&writer, 0);
13028 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13029 self, format_spec, 0,
13030 PyUnicode_GET_LENGTH(format_spec));
13031 if (ret == -1) {
13032 _PyUnicodeWriter_Dealloc(&writer);
13033 return NULL;
13034 }
13035 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013036}
13037
Eric Smith8c663262007-08-25 02:26:07 +000013038PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013039 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013040\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013041Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013042
13043static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013044unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 Py_ssize_t size;
13047
13048 /* If it's a compact object, account for base structure +
13049 character data. */
13050 if (PyUnicode_IS_COMPACT_ASCII(v))
13051 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13052 else if (PyUnicode_IS_COMPACT(v))
13053 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013054 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 else {
13056 /* If it is a two-block object, account for base object, and
13057 for character block if present. */
13058 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013059 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013061 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 }
13063 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013064 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013065 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013067 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013068 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069
13070 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013071}
13072
13073PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013074 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013075
13076static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013077unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013078{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013079 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 if (!copy)
13081 return NULL;
13082 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013083}
13084
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013086 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013087 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013088 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13089 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013090 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13091 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013092 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013093 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13094 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13095 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13096 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13097 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013098 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013099 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13100 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13101 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013102 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013103 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13104 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13105 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013106 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013107 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013108 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013109 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013110 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13111 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13112 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13113 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13114 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13115 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13116 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13117 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13118 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13119 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13120 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13121 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13122 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13123 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013124 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013125 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013126 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013127 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013128 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013129 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013130 {"maketrans", (PyCFunction) unicode_maketrans,
13131 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013132 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013133#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013134 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013135 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136#endif
13137
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 {NULL, NULL}
13140};
13141
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013142static PyObject *
13143unicode_mod(PyObject *v, PyObject *w)
13144{
Brian Curtindfc80e32011-08-10 20:28:54 -050013145 if (!PyUnicode_Check(v))
13146 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013148}
13149
13150static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013151 0, /*nb_add*/
13152 0, /*nb_subtract*/
13153 0, /*nb_multiply*/
13154 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013155};
13156
Guido van Rossumd57fd912000-03-10 22:53:23 +000013157static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013158 (lenfunc) unicode_length, /* sq_length */
13159 PyUnicode_Concat, /* sq_concat */
13160 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13161 (ssizeargfunc) unicode_getitem, /* sq_item */
13162 0, /* sq_slice */
13163 0, /* sq_ass_item */
13164 0, /* sq_ass_slice */
13165 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013166};
13167
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013168static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013169unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013170{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013171 if (PyUnicode_READY(self) == -1)
13172 return NULL;
13173
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013174 if (PyIndex_Check(item)) {
13175 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013176 if (i == -1 && PyErr_Occurred())
13177 return NULL;
13178 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013180 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013181 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013182 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013183 PyObject *result;
13184 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013185 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013186 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013190 return NULL;
13191 }
13192
13193 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013194 Py_INCREF(unicode_empty);
13195 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013197 slicelength == PyUnicode_GET_LENGTH(self)) {
13198 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013199 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013200 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013201 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013202 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013203 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013204 src_kind = PyUnicode_KIND(self);
13205 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013206 if (!PyUnicode_IS_ASCII(self)) {
13207 kind_limit = kind_maxchar_limit(src_kind);
13208 max_char = 0;
13209 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13210 ch = PyUnicode_READ(src_kind, src_data, cur);
13211 if (ch > max_char) {
13212 max_char = ch;
13213 if (max_char >= kind_limit)
13214 break;
13215 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013216 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013217 }
Victor Stinner55c99112011-10-13 01:17:06 +020013218 else
13219 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013220 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013221 if (result == NULL)
13222 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013223 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013224 dest_data = PyUnicode_DATA(result);
13225
13226 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013227 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13228 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013229 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013230 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013231 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013232 } else {
13233 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13234 return NULL;
13235 }
13236}
13237
13238static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013239 (lenfunc)unicode_length, /* mp_length */
13240 (binaryfunc)unicode_subscript, /* mp_subscript */
13241 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013242};
13243
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244
Guido van Rossumd57fd912000-03-10 22:53:23 +000013245/* Helpers for PyUnicode_Format() */
13246
Victor Stinnera47082312012-10-04 02:19:54 +020013247struct unicode_formatter_t {
13248 PyObject *args;
13249 int args_owned;
13250 Py_ssize_t arglen, argidx;
13251 PyObject *dict;
13252
13253 enum PyUnicode_Kind fmtkind;
13254 Py_ssize_t fmtcnt, fmtpos;
13255 void *fmtdata;
13256 PyObject *fmtstr;
13257
13258 _PyUnicodeWriter writer;
13259};
13260
13261struct unicode_format_arg_t {
13262 Py_UCS4 ch;
13263 int flags;
13264 Py_ssize_t width;
13265 int prec;
13266 int sign;
13267};
13268
Guido van Rossumd57fd912000-03-10 22:53:23 +000013269static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013270unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271{
Victor Stinnera47082312012-10-04 02:19:54 +020013272 Py_ssize_t argidx = ctx->argidx;
13273
13274 if (argidx < ctx->arglen) {
13275 ctx->argidx++;
13276 if (ctx->arglen < 0)
13277 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 else
Victor Stinnera47082312012-10-04 02:19:54 +020013279 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280 }
13281 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283 return NULL;
13284}
13285
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013286/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287
Victor Stinnera47082312012-10-04 02:19:54 +020013288/* Format a float into the writer if the writer is not NULL, or into *p_output
13289 otherwise.
13290
13291 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013292static int
Victor Stinnera47082312012-10-04 02:19:54 +020013293formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13294 PyObject **p_output,
13295 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013297 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013298 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013299 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013300 int prec;
13301 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013302
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303 x = PyFloat_AsDouble(v);
13304 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013305 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013306
Victor Stinnera47082312012-10-04 02:19:54 +020013307 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013310
Victor Stinnera47082312012-10-04 02:19:54 +020013311 if (arg->flags & F_ALT)
13312 dtoa_flags = Py_DTSF_ALT;
13313 else
13314 dtoa_flags = 0;
13315 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013316 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013317 return -1;
13318 len = strlen(p);
13319 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013320 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13321 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013322 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013323 }
Victor Stinner184252a2012-06-16 02:57:41 +020013324 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013325 writer->pos += len;
13326 }
13327 else
13328 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013329 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013330 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331}
13332
Victor Stinnerd0880d52012-04-27 23:40:13 +020013333/* formatlong() emulates the format codes d, u, o, x and X, and
13334 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13335 * Python's regular ints.
13336 * Return value: a new PyUnicodeObject*, or NULL if error.
13337 * The output string is of the form
13338 * "-"? ("0x" | "0X")? digit+
13339 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13340 * set in flags. The case of hex digits will be correct,
13341 * There will be at least prec digits, zero-filled on the left if
13342 * necessary to get that many.
13343 * val object to be converted
13344 * flags bitmask of format flags; only F_ALT is looked at
13345 * prec minimum number of digits; 0-fill on left if needed
13346 * type a character in [duoxX]; u acts the same as d
13347 *
13348 * CAUTION: o, x and X conversions on regular ints can never
13349 * produce a '-' sign, but can for Python's unbounded ints.
13350 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013351static PyObject*
Victor Stinnera47082312012-10-04 02:19:54 +020013352formatlong(PyObject *val, struct unicode_format_arg_t *arg)
Tim Peters38fd5b62000-09-21 05:43:11 +000013353{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013354 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013355 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013356 Py_ssize_t i;
13357 int sign; /* 1 if '-', else 0 */
13358 int len; /* number of characters */
13359 Py_ssize_t llen;
13360 int numdigits; /* len == numnondigits + numdigits */
13361 int numnondigits = 0;
Victor Stinnera47082312012-10-04 02:19:54 +020013362 int prec = arg->prec;
13363 int type = arg->ch;
Tim Peters38fd5b62000-09-21 05:43:11 +000013364
Victor Stinnerd0880d52012-04-27 23:40:13 +020013365 /* Avoid exceeding SSIZE_T_MAX */
13366 if (prec > INT_MAX-3) {
13367 PyErr_SetString(PyExc_OverflowError,
13368 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013369 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013370 }
13371
13372 assert(PyLong_Check(val));
13373
13374 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013375 default:
13376 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013377 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013378 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013379 case 'u':
13380 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013381 if (PyBool_Check(val))
13382 result = PyNumber_ToBase(val, 10);
13383 else
13384 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013385 break;
13386 case 'o':
13387 numnondigits = 2;
13388 result = PyNumber_ToBase(val, 8);
13389 break;
13390 case 'x':
13391 case 'X':
13392 numnondigits = 2;
13393 result = PyNumber_ToBase(val, 16);
13394 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013395 }
13396 if (!result)
13397 return NULL;
13398
13399 assert(unicode_modifiable(result));
13400 assert(PyUnicode_IS_READY(result));
13401 assert(PyUnicode_IS_ASCII(result));
13402
13403 /* To modify the string in-place, there can only be one reference. */
13404 if (Py_REFCNT(result) != 1) {
13405 PyErr_BadInternalCall();
13406 return NULL;
13407 }
13408 buf = PyUnicode_DATA(result);
13409 llen = PyUnicode_GET_LENGTH(result);
13410 if (llen > INT_MAX) {
13411 PyErr_SetString(PyExc_ValueError,
13412 "string too large in _PyBytes_FormatLong");
13413 return NULL;
13414 }
13415 len = (int)llen;
13416 sign = buf[0] == '-';
13417 numnondigits += sign;
13418 numdigits = len - numnondigits;
13419 assert(numdigits > 0);
13420
13421 /* Get rid of base marker unless F_ALT */
Victor Stinnera47082312012-10-04 02:19:54 +020013422 if (((arg->flags & F_ALT) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013423 (type == 'o' || type == 'x' || type == 'X'))) {
13424 assert(buf[sign] == '0');
13425 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13426 buf[sign+1] == 'o');
13427 numnondigits -= 2;
13428 buf += 2;
13429 len -= 2;
13430 if (sign)
13431 buf[0] = '-';
13432 assert(len == numnondigits + numdigits);
13433 assert(numdigits > 0);
13434 }
13435
13436 /* Fill with leading zeroes to meet minimum width. */
13437 if (prec > numdigits) {
13438 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13439 numnondigits + prec);
13440 char *b1;
13441 if (!r1) {
13442 Py_DECREF(result);
13443 return NULL;
13444 }
13445 b1 = PyBytes_AS_STRING(r1);
13446 for (i = 0; i < numnondigits; ++i)
13447 *b1++ = *buf++;
13448 for (i = 0; i < prec - numdigits; i++)
13449 *b1++ = '0';
13450 for (i = 0; i < numdigits; i++)
13451 *b1++ = *buf++;
13452 *b1 = '\0';
13453 Py_DECREF(result);
13454 result = r1;
13455 buf = PyBytes_AS_STRING(result);
13456 len = numnondigits + prec;
13457 }
13458
13459 /* Fix up case for hex conversions. */
13460 if (type == 'X') {
13461 /* Need to convert all lower case letters to upper case.
13462 and need to convert 0x to 0X (and -0x to -0X). */
13463 for (i = 0; i < len; i++)
13464 if (buf[i] >= 'a' && buf[i] <= 'x')
13465 buf[i] -= 'a'-'A';
13466 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013467 if (!PyUnicode_Check(result)
13468 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013469 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013470 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013471 Py_DECREF(result);
13472 result = unicode;
13473 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013474 else if (len != PyUnicode_GET_LENGTH(result)) {
13475 if (PyUnicode_Resize(&result, len) < 0)
13476 Py_CLEAR(result);
13477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013478 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013479}
13480
Victor Stinner621ef3d2012-10-02 00:33:47 +020013481/* Format an integer.
13482 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013483 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013484 * -1 and raise an exception on error */
13485static int
Victor Stinnera47082312012-10-04 02:19:54 +020013486mainformatlong(PyObject *v,
13487 struct unicode_format_arg_t *arg,
13488 PyObject **p_output,
13489 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013490{
13491 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020013492 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013493
13494 if (!PyNumber_Check(v))
13495 goto wrongtype;
13496
13497 if (!PyLong_Check(v)) {
13498 iobj = PyNumber_Long(v);
13499 if (iobj == NULL) {
13500 if (PyErr_ExceptionMatches(PyExc_TypeError))
13501 goto wrongtype;
13502 return -1;
13503 }
13504 assert(PyLong_Check(iobj));
13505 }
13506 else {
13507 iobj = v;
13508 Py_INCREF(iobj);
13509 }
13510
13511 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020013512 && arg->width == -1 && arg->prec == -1
13513 && !(arg->flags & (F_SIGN | F_BLANK))
13514 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020013515 {
13516 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020013517 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013518 int base;
13519
Victor Stinnera47082312012-10-04 02:19:54 +020013520 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020013521 {
13522 default:
13523 assert(0 && "'type' not in [diuoxX]");
13524 case 'd':
13525 case 'i':
13526 case 'u':
13527 base = 10;
13528 break;
13529 case 'o':
13530 base = 8;
13531 break;
13532 case 'x':
13533 case 'X':
13534 base = 16;
13535 break;
13536 }
13537
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013538 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13539 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013540 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020013541 }
13542 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013543 return 1;
13544 }
13545
Victor Stinnera47082312012-10-04 02:19:54 +020013546 res = formatlong(iobj, arg);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013547 Py_DECREF(iobj);
13548 if (res == NULL)
13549 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020013550 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020013551 return 0;
13552
13553wrongtype:
13554 PyErr_Format(PyExc_TypeError,
13555 "%%%c format: a number is required, "
Victor Stinnera47082312012-10-04 02:19:54 +020013556 "not %.200s",
13557 type, Py_TYPE(v)->tp_name);
Victor Stinner621ef3d2012-10-02 00:33:47 +020013558 return -1;
13559}
13560
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013561static Py_UCS4
13562formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013563{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013564 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013565 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013566 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013567 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 goto onError;
13570 }
13571 else {
13572 /* Integer input truncated to a character */
13573 long x;
13574 x = PyLong_AsLong(v);
13575 if (x == -1 && PyErr_Occurred())
13576 goto onError;
13577
Victor Stinner8faf8212011-12-08 22:14:11 +010013578 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 PyErr_SetString(PyExc_OverflowError,
13580 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 }
13583
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013584 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013585 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013586
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013588 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013590 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013591}
13592
Victor Stinnera47082312012-10-04 02:19:54 +020013593/* Parse options of an argument: flags, width, precision.
13594 Handle also "%(name)" syntax.
13595
13596 Return 0 if the argument has been formatted into arg->str.
13597 Return 1 if the argument has been written into ctx->writer,
13598 Raise an exception and return -1 on error. */
13599static int
13600unicode_format_arg_parse(struct unicode_formatter_t *ctx,
13601 struct unicode_format_arg_t *arg)
13602{
13603#define FORMAT_READ(ctx) \
13604 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
13605
13606 PyObject *v;
13607
13608 arg->ch = FORMAT_READ(ctx);
13609 if (arg->ch == '(') {
13610 /* Get argument value from a dictionary. Example: "%(name)s". */
13611 Py_ssize_t keystart;
13612 Py_ssize_t keylen;
13613 PyObject *key;
13614 int pcount = 1;
13615
13616 if (ctx->dict == NULL) {
13617 PyErr_SetString(PyExc_TypeError,
13618 "format requires a mapping");
13619 return -1;
13620 }
13621 ++ctx->fmtpos;
13622 --ctx->fmtcnt;
13623 keystart = ctx->fmtpos;
13624 /* Skip over balanced parentheses */
13625 while (pcount > 0 && --ctx->fmtcnt >= 0) {
13626 arg->ch = FORMAT_READ(ctx);
13627 if (arg->ch == ')')
13628 --pcount;
13629 else if (arg->ch == '(')
13630 ++pcount;
13631 ctx->fmtpos++;
13632 }
13633 keylen = ctx->fmtpos - keystart - 1;
13634 if (ctx->fmtcnt < 0 || pcount > 0) {
13635 PyErr_SetString(PyExc_ValueError,
13636 "incomplete format key");
13637 return -1;
13638 }
13639 key = PyUnicode_Substring(ctx->fmtstr,
13640 keystart, keystart + keylen);
13641 if (key == NULL)
13642 return -1;
13643 if (ctx->args_owned) {
13644 Py_DECREF(ctx->args);
13645 ctx->args_owned = 0;
13646 }
13647 ctx->args = PyObject_GetItem(ctx->dict, key);
13648 Py_DECREF(key);
13649 if (ctx->args == NULL)
13650 return -1;
13651 ctx->args_owned = 1;
13652 ctx->arglen = -1;
13653 ctx->argidx = -2;
13654 }
13655
13656 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
13657 arg->flags = 0;
13658 while (--ctx->fmtcnt >= 0) {
13659 arg->ch = FORMAT_READ(ctx);
13660 ctx->fmtpos++;
13661 switch (arg->ch) {
13662 case '-': arg->flags |= F_LJUST; continue;
13663 case '+': arg->flags |= F_SIGN; continue;
13664 case ' ': arg->flags |= F_BLANK; continue;
13665 case '#': arg->flags |= F_ALT; continue;
13666 case '0': arg->flags |= F_ZERO; continue;
13667 }
13668 break;
13669 }
13670
13671 /* Parse width. Example: "%10s" => width=10 */
13672 arg->width = -1;
13673 if (arg->ch == '*') {
13674 v = unicode_format_getnextarg(ctx);
13675 if (v == NULL)
13676 return -1;
13677 if (!PyLong_Check(v)) {
13678 PyErr_SetString(PyExc_TypeError,
13679 "* wants int");
13680 return -1;
13681 }
13682 arg->width = PyLong_AsLong(v);
13683 if (arg->width == -1 && PyErr_Occurred())
13684 return -1;
13685 if (arg->width < 0) {
13686 arg->flags |= F_LJUST;
13687 arg->width = -arg->width;
13688 }
13689 if (--ctx->fmtcnt >= 0) {
13690 arg->ch = FORMAT_READ(ctx);
13691 ctx->fmtpos++;
13692 }
13693 }
13694 else if (arg->ch >= '0' && arg->ch <= '9') {
13695 arg->width = arg->ch - '0';
13696 while (--ctx->fmtcnt >= 0) {
13697 arg->ch = FORMAT_READ(ctx);
13698 ctx->fmtpos++;
13699 if (arg->ch < '0' || arg->ch > '9')
13700 break;
13701 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
13702 mixing signed and unsigned comparison. Since arg->ch is between
13703 '0' and '9', casting to int is safe. */
13704 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
13705 PyErr_SetString(PyExc_ValueError,
13706 "width too big");
13707 return -1;
13708 }
13709 arg->width = arg->width*10 + (arg->ch - '0');
13710 }
13711 }
13712
13713 /* Parse precision. Example: "%.3f" => prec=3 */
13714 arg->prec = -1;
13715 if (arg->ch == '.') {
13716 arg->prec = 0;
13717 if (--ctx->fmtcnt >= 0) {
13718 arg->ch = FORMAT_READ(ctx);
13719 ctx->fmtpos++;
13720 }
13721 if (arg->ch == '*') {
13722 v = unicode_format_getnextarg(ctx);
13723 if (v == NULL)
13724 return -1;
13725 if (!PyLong_Check(v)) {
13726 PyErr_SetString(PyExc_TypeError,
13727 "* wants int");
13728 return -1;
13729 }
13730 arg->prec = PyLong_AsLong(v);
13731 if (arg->prec == -1 && PyErr_Occurred())
13732 return -1;
13733 if (arg->prec < 0)
13734 arg->prec = 0;
13735 if (--ctx->fmtcnt >= 0) {
13736 arg->ch = FORMAT_READ(ctx);
13737 ctx->fmtpos++;
13738 }
13739 }
13740 else if (arg->ch >= '0' && arg->ch <= '9') {
13741 arg->prec = arg->ch - '0';
13742 while (--ctx->fmtcnt >= 0) {
13743 arg->ch = FORMAT_READ(ctx);
13744 ctx->fmtpos++;
13745 if (arg->ch < '0' || arg->ch > '9')
13746 break;
13747 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
13748 PyErr_SetString(PyExc_ValueError,
13749 "prec too big");
13750 return -1;
13751 }
13752 arg->prec = arg->prec*10 + (arg->ch - '0');
13753 }
13754 }
13755 }
13756
13757 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
13758 if (ctx->fmtcnt >= 0) {
13759 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
13760 if (--ctx->fmtcnt >= 0) {
13761 arg->ch = FORMAT_READ(ctx);
13762 ctx->fmtpos++;
13763 }
13764 }
13765 }
13766 if (ctx->fmtcnt < 0) {
13767 PyErr_SetString(PyExc_ValueError,
13768 "incomplete format");
13769 return -1;
13770 }
13771 return 0;
13772
13773#undef FORMAT_READ
13774}
13775
13776/* Format one argument. Supported conversion specifiers:
13777
13778 - "s", "r", "a": any type
13779 - "i", "d", "u", "o", "x", "X": int
13780 - "e", "E", "f", "F", "g", "G": float
13781 - "c": int or str (1 character)
13782
13783 Return 0 if the argument has been formatted into *p_str,
13784 1 if the argument has been written into ctx->writer,
13785 -1 on error. */
13786static int
13787unicode_format_arg_format(struct unicode_formatter_t *ctx,
13788 struct unicode_format_arg_t *arg,
13789 PyObject **p_str)
13790{
13791 PyObject *v;
13792 _PyUnicodeWriter *writer = &ctx->writer;
13793
13794 if (ctx->fmtcnt == 0)
13795 ctx->writer.overallocate = 0;
13796
13797 if (arg->ch == '%') {
13798 if (_PyUnicodeWriter_Prepare(writer, 1, '%') == -1)
13799 return -1;
13800 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '%');
13801 writer->pos += 1;
13802 return 1;
13803 }
13804
13805 v = unicode_format_getnextarg(ctx);
13806 if (v == NULL)
13807 return -1;
13808
13809 arg->sign = 0;
13810
13811 switch (arg->ch) {
13812
13813 case 's':
13814 case 'r':
13815 case 'a':
13816 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13817 /* Fast path */
13818 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13819 return -1;
13820 return 1;
13821 }
13822
13823 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13824 *p_str = v;
13825 Py_INCREF(*p_str);
13826 }
13827 else {
13828 if (arg->ch == 's')
13829 *p_str = PyObject_Str(v);
13830 else if (arg->ch == 'r')
13831 *p_str = PyObject_Repr(v);
13832 else
13833 *p_str = PyObject_ASCII(v);
13834 }
13835 break;
13836
13837 case 'i':
13838 case 'd':
13839 case 'u':
13840 case 'o':
13841 case 'x':
13842 case 'X':
13843 {
13844 int ret = mainformatlong(v, arg, p_str, writer);
13845 if (ret != 0)
13846 return ret;
13847 arg->sign = 1;
13848 break;
13849 }
13850
13851 case 'e':
13852 case 'E':
13853 case 'f':
13854 case 'F':
13855 case 'g':
13856 case 'G':
13857 if (arg->width == -1 && arg->prec == -1
13858 && !(arg->flags & (F_SIGN | F_BLANK)))
13859 {
13860 /* Fast path */
13861 if (formatfloat(v, arg, NULL, writer) == -1)
13862 return -1;
13863 return 1;
13864 }
13865
13866 arg->sign = 1;
13867 if (formatfloat(v, arg, p_str, NULL) == -1)
13868 return -1;
13869 break;
13870
13871 case 'c':
13872 {
13873 Py_UCS4 ch = formatchar(v);
13874 if (ch == (Py_UCS4) -1)
13875 return -1;
13876 if (arg->width == -1 && arg->prec == -1) {
13877 /* Fast path */
13878 if (_PyUnicodeWriter_Prepare(writer, 1, ch) == -1)
13879 return -1;
13880 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13881 writer->pos += 1;
13882 return 1;
13883 }
13884 *p_str = PyUnicode_FromOrdinal(ch);
13885 break;
13886 }
13887
13888 default:
13889 PyErr_Format(PyExc_ValueError,
13890 "unsupported format character '%c' (0x%x) "
13891 "at index %zd",
13892 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
13893 (int)arg->ch,
13894 ctx->fmtpos - 1);
13895 return -1;
13896 }
13897 if (*p_str == NULL)
13898 return -1;
13899 assert (PyUnicode_Check(*p_str));
13900 return 0;
13901}
13902
13903static int
13904unicode_format_arg_output(struct unicode_formatter_t *ctx,
13905 struct unicode_format_arg_t *arg,
13906 PyObject *str)
13907{
13908 Py_ssize_t len;
13909 enum PyUnicode_Kind kind;
13910 void *pbuf;
13911 Py_ssize_t pindex;
13912 Py_UCS4 signchar;
13913 Py_ssize_t buflen;
13914 Py_UCS4 maxchar, bufmaxchar;
13915 Py_ssize_t sublen;
13916 _PyUnicodeWriter *writer = &ctx->writer;
13917 Py_UCS4 fill;
13918
13919 fill = ' ';
13920 if (arg->sign && arg->flags & F_ZERO)
13921 fill = '0';
13922
13923 if (PyUnicode_READY(str) == -1)
13924 return -1;
13925
13926 len = PyUnicode_GET_LENGTH(str);
13927 if ((arg->width == -1 || arg->width <= len)
13928 && (arg->prec == -1 || arg->prec >= len)
13929 && !(arg->flags & (F_SIGN | F_BLANK)))
13930 {
13931 /* Fast path */
13932 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
13933 return -1;
13934 return 0;
13935 }
13936
13937 /* Truncate the string for "s", "r" and "a" formats
13938 if the precision is set */
13939 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
13940 if (arg->prec >= 0 && len > arg->prec)
13941 len = arg->prec;
13942 }
13943
13944 /* Adjust sign and width */
13945 kind = PyUnicode_KIND(str);
13946 pbuf = PyUnicode_DATA(str);
13947 pindex = 0;
13948 signchar = '\0';
13949 if (arg->sign) {
13950 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13951 if (ch == '-' || ch == '+') {
13952 signchar = ch;
13953 len--;
13954 pindex++;
13955 }
13956 else if (arg->flags & F_SIGN)
13957 signchar = '+';
13958 else if (arg->flags & F_BLANK)
13959 signchar = ' ';
13960 else
13961 arg->sign = 0;
13962 }
13963 if (arg->width < len)
13964 arg->width = len;
13965
13966 /* Prepare the writer */
13967 bufmaxchar = 127;
13968 if (!(arg->flags & F_LJUST)) {
13969 if (arg->sign) {
13970 if ((arg->width-1) > len)
13971 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13972 }
13973 else {
13974 if (arg->width > len)
13975 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
13976 }
13977 }
13978 maxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
13979 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
13980 buflen = arg->width;
13981 if (arg->sign && len == arg->width)
13982 buflen++;
13983 if (_PyUnicodeWriter_Prepare(writer, buflen, bufmaxchar) == -1)
13984 return -1;
13985
13986 /* Write the sign if needed */
13987 if (arg->sign) {
13988 if (fill != ' ') {
13989 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
13990 writer->pos += 1;
13991 }
13992 if (arg->width > len)
13993 arg->width--;
13994 }
13995
13996 /* Write the numeric prefix for "x", "X" and "o" formats
13997 if the alternate form is used.
13998 For example, write "0x" for the "%#x" format. */
13999 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14000 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14001 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14002 if (fill != ' ') {
14003 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14004 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14005 writer->pos += 2;
14006 pindex += 2;
14007 }
14008 arg->width -= 2;
14009 if (arg->width < 0)
14010 arg->width = 0;
14011 len -= 2;
14012 }
14013
14014 /* Pad left with the fill character if needed */
14015 if (arg->width > len && !(arg->flags & F_LJUST)) {
14016 sublen = arg->width - len;
14017 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14018 writer->pos += sublen;
14019 arg->width = len;
14020 }
14021
14022 /* If padding with spaces: write sign if needed and/or numeric prefix if
14023 the alternate form is used */
14024 if (fill == ' ') {
14025 if (arg->sign) {
14026 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14027 writer->pos += 1;
14028 }
14029 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14030 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14031 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14032 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14033 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14034 writer->pos += 2;
14035 pindex += 2;
14036 }
14037 }
14038
14039 /* Write characters */
14040 if (len) {
14041 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14042 str, pindex, len);
14043 writer->pos += len;
14044 }
14045
14046 /* Pad right with the fill character if needed */
14047 if (arg->width > len) {
14048 sublen = arg->width - len;
14049 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14050 writer->pos += sublen;
14051 }
14052 return 0;
14053}
14054
14055/* Helper of PyUnicode_Format(): format one arg.
14056 Return 0 on success, raise an exception and return -1 on error. */
14057static int
14058unicode_format_arg(struct unicode_formatter_t *ctx)
14059{
14060 struct unicode_format_arg_t arg;
14061 PyObject *str;
14062 int ret;
14063
14064 ret = unicode_format_arg_parse(ctx, &arg);
14065 if (ret == -1)
14066 return -1;
14067
14068 ret = unicode_format_arg_format(ctx, &arg, &str);
14069 if (ret == -1)
14070 return -1;
14071
14072 if (ret != 1) {
14073 ret = unicode_format_arg_output(ctx, &arg, str);
14074 Py_DECREF(str);
14075 if (ret == -1)
14076 return -1;
14077 }
14078
14079 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14080 PyErr_SetString(PyExc_TypeError,
14081 "not all arguments converted during string formatting");
14082 return -1;
14083 }
14084 return 0;
14085}
14086
Alexander Belopolsky40018472011-02-26 01:02:56 +000014087PyObject *
14088PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014089{
Victor Stinnera47082312012-10-04 02:19:54 +020014090 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014091
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014093 PyErr_BadInternalCall();
14094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095 }
Victor Stinnera47082312012-10-04 02:19:54 +020014096
14097 ctx.fmtstr = PyUnicode_FromObject(format);
14098 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014099 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014100 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14101 Py_DECREF(ctx.fmtstr);
14102 return NULL;
14103 }
14104 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14105 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14106 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14107 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014108
Victor Stinnera47082312012-10-04 02:19:54 +020014109 _PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014110
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014112 ctx.arglen = PyTuple_Size(args);
14113 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014114 }
14115 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014116 ctx.arglen = -1;
14117 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 }
Victor Stinnera47082312012-10-04 02:19:54 +020014119 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014120 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014121 ctx.dict = args;
14122 else
14123 ctx.dict = NULL;
14124 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014125
Victor Stinnera47082312012-10-04 02:19:54 +020014126 while (--ctx.fmtcnt >= 0) {
14127 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14128 Py_ssize_t nonfmtpos, sublen;
14129 Py_UCS4 maxchar;
14130
14131 nonfmtpos = ctx.fmtpos++;
14132 while (ctx.fmtcnt >= 0 &&
14133 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14134 ctx.fmtpos++;
14135 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 }
Victor Stinnera47082312012-10-04 02:19:54 +020014137 if (ctx.fmtcnt < 0) {
14138 ctx.fmtpos--;
14139 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014140 }
Victor Stinnera47082312012-10-04 02:19:54 +020014141 sublen = ctx.fmtpos - nonfmtpos;
14142 maxchar = _PyUnicode_FindMaxChar(ctx.fmtstr,
Victor Stinneree4544c2012-05-09 22:24:08 +020014143 nonfmtpos, nonfmtpos + sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014144 if (_PyUnicodeWriter_Prepare(&ctx.writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014145 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020014146
Victor Stinnera47082312012-10-04 02:19:54 +020014147 _PyUnicode_FastCopyCharacters(ctx.writer.buffer, ctx.writer.pos,
14148 ctx.fmtstr, nonfmtpos, sublen);
14149 ctx.writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014150 }
14151 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014152 ctx.fmtpos++;
14153 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014154 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014155 }
14156 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014157
Victor Stinnera47082312012-10-04 02:19:54 +020014158 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014159 PyErr_SetString(PyExc_TypeError,
14160 "not all arguments converted during string formatting");
14161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014162 }
14163
Victor Stinnera47082312012-10-04 02:19:54 +020014164 if (ctx.args_owned) {
14165 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014166 }
Victor Stinnera47082312012-10-04 02:19:54 +020014167 Py_DECREF(ctx.fmtstr);
14168 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014169
Benjamin Peterson29060642009-01-31 22:14:21 +000014170 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014171 Py_DECREF(ctx.fmtstr);
14172 _PyUnicodeWriter_Dealloc(&ctx.writer);
14173 if (ctx.args_owned) {
14174 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175 }
14176 return NULL;
14177}
14178
Jeremy Hylton938ace62002-07-17 16:30:39 +000014179static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014180unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14181
Tim Peters6d6c1a32001-08-02 04:15:00 +000014182static PyObject *
14183unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14184{
Benjamin Peterson29060642009-01-31 22:14:21 +000014185 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 static char *kwlist[] = {"object", "encoding", "errors", 0};
14187 char *encoding = NULL;
14188 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014189
Benjamin Peterson14339b62009-01-31 16:36:08 +000014190 if (type != &PyUnicode_Type)
14191 return unicode_subtype_new(type, args, kwds);
14192 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014193 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010014195 if (x == NULL) {
14196 Py_INCREF(unicode_empty);
14197 return unicode_empty;
14198 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014199 if (encoding == NULL && errors == NULL)
14200 return PyObject_Str(x);
14201 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014202 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014203}
14204
Guido van Rossume023fe02001-08-30 03:12:59 +000014205static PyObject *
14206unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14207{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014208 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014209 Py_ssize_t length, char_size;
14210 int share_wstr, share_utf8;
14211 unsigned int kind;
14212 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014213
Benjamin Peterson14339b62009-01-31 16:36:08 +000014214 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014215
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014216 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014217 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014218 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014219 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014220 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014221 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014222 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014223 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014224
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014225 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014226 if (self == NULL) {
14227 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014228 return NULL;
14229 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014230 kind = PyUnicode_KIND(unicode);
14231 length = PyUnicode_GET_LENGTH(unicode);
14232
14233 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014234#ifdef Py_DEBUG
14235 _PyUnicode_HASH(self) = -1;
14236#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014237 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014238#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014239 _PyUnicode_STATE(self).interned = 0;
14240 _PyUnicode_STATE(self).kind = kind;
14241 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014242 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014243 _PyUnicode_STATE(self).ready = 1;
14244 _PyUnicode_WSTR(self) = NULL;
14245 _PyUnicode_UTF8_LENGTH(self) = 0;
14246 _PyUnicode_UTF8(self) = NULL;
14247 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014248 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014249
14250 share_utf8 = 0;
14251 share_wstr = 0;
14252 if (kind == PyUnicode_1BYTE_KIND) {
14253 char_size = 1;
14254 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14255 share_utf8 = 1;
14256 }
14257 else if (kind == PyUnicode_2BYTE_KIND) {
14258 char_size = 2;
14259 if (sizeof(wchar_t) == 2)
14260 share_wstr = 1;
14261 }
14262 else {
14263 assert(kind == PyUnicode_4BYTE_KIND);
14264 char_size = 4;
14265 if (sizeof(wchar_t) == 4)
14266 share_wstr = 1;
14267 }
14268
14269 /* Ensure we won't overflow the length. */
14270 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14271 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014272 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014273 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014274 data = PyObject_MALLOC((length + 1) * char_size);
14275 if (data == NULL) {
14276 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014277 goto onError;
14278 }
14279
Victor Stinnerc3c74152011-10-02 20:39:55 +020014280 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014281 if (share_utf8) {
14282 _PyUnicode_UTF8_LENGTH(self) = length;
14283 _PyUnicode_UTF8(self) = data;
14284 }
14285 if (share_wstr) {
14286 _PyUnicode_WSTR_LENGTH(self) = length;
14287 _PyUnicode_WSTR(self) = (wchar_t *)data;
14288 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014289
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014290 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014291 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014292 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014293#ifdef Py_DEBUG
14294 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14295#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014296 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014297 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014298
14299onError:
14300 Py_DECREF(unicode);
14301 Py_DECREF(self);
14302 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014303}
14304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014305PyDoc_STRVAR(unicode_doc,
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014306 "str(object[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014307\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014308Create a new string object from the given object. If encoding or\n\
14309errors is specified, then the object must expose a data buffer\n\
14310that will be decoded using the given encoding and error handler.\n\
14311Otherwise, returns the result of object.__str__() (if defined)\n\
14312or repr(object).\n\
14313encoding defaults to sys.getdefaultencoding().\n\
14314errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014315
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014316static PyObject *unicode_iter(PyObject *seq);
14317
Guido van Rossumd57fd912000-03-10 22:53:23 +000014318PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014319 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014320 "str", /* tp_name */
14321 sizeof(PyUnicodeObject), /* tp_size */
14322 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014323 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014324 (destructor)unicode_dealloc, /* tp_dealloc */
14325 0, /* tp_print */
14326 0, /* tp_getattr */
14327 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014328 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 unicode_repr, /* tp_repr */
14330 &unicode_as_number, /* tp_as_number */
14331 &unicode_as_sequence, /* tp_as_sequence */
14332 &unicode_as_mapping, /* tp_as_mapping */
14333 (hashfunc) unicode_hash, /* tp_hash*/
14334 0, /* tp_call*/
14335 (reprfunc) unicode_str, /* tp_str */
14336 PyObject_GenericGetAttr, /* tp_getattro */
14337 0, /* tp_setattro */
14338 0, /* tp_as_buffer */
14339 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014340 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 unicode_doc, /* tp_doc */
14342 0, /* tp_traverse */
14343 0, /* tp_clear */
14344 PyUnicode_RichCompare, /* tp_richcompare */
14345 0, /* tp_weaklistoffset */
14346 unicode_iter, /* tp_iter */
14347 0, /* tp_iternext */
14348 unicode_methods, /* tp_methods */
14349 0, /* tp_members */
14350 0, /* tp_getset */
14351 &PyBaseObject_Type, /* tp_base */
14352 0, /* tp_dict */
14353 0, /* tp_descr_get */
14354 0, /* tp_descr_set */
14355 0, /* tp_dictoffset */
14356 0, /* tp_init */
14357 0, /* tp_alloc */
14358 unicode_new, /* tp_new */
14359 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014360};
14361
14362/* Initialize the Unicode implementation */
14363
Victor Stinner3a50e702011-10-18 21:21:00 +020014364int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014365{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014366 int i;
14367
Thomas Wouters477c8d52006-05-27 19:21:47 +000014368 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014369 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014370 0x000A, /* LINE FEED */
14371 0x000D, /* CARRIAGE RETURN */
14372 0x001C, /* FILE SEPARATOR */
14373 0x001D, /* GROUP SEPARATOR */
14374 0x001E, /* RECORD SEPARATOR */
14375 0x0085, /* NEXT LINE */
14376 0x2028, /* LINE SEPARATOR */
14377 0x2029, /* PARAGRAPH SEPARATOR */
14378 };
14379
Fred Drakee4315f52000-05-09 19:53:39 +000014380 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014381 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014382 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014383 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014384 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014385
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014386 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014387 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014388 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014389 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014390
14391 /* initialize the linebreak bloom filter */
14392 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014393 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014394 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014395
14396 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014397
14398#ifdef HAVE_MBCS
14399 winver.dwOSVersionInfoSize = sizeof(winver);
14400 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14401 PyErr_SetFromWindowsErr(0);
14402 return -1;
14403 }
14404#endif
14405 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014406}
14407
14408/* Finalize the Unicode implementation */
14409
/* Clear the free list of the Unicode implementation.
   Nothing is released here, so the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14415
Guido van Rossumd57fd912000-03-10 22:53:23 +000014416void
Thomas Wouters78890102000-07-22 19:25:51 +000014417_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014418{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014419 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014420
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014421 Py_XDECREF(unicode_empty);
14422 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014423
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014424 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 if (unicode_latin1[i]) {
14426 Py_DECREF(unicode_latin1[i]);
14427 unicode_latin1[i] = NULL;
14428 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014429 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014430 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014431 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014432}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014433
Walter Dörwald16807132007-05-25 13:52:07 +000014434void
14435PyUnicode_InternInPlace(PyObject **p)
14436{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014437 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014438 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014439#ifdef Py_DEBUG
14440 assert(s != NULL);
14441 assert(_PyUnicode_CHECK(s));
14442#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014443 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014444 return;
14445#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014446 /* If it's a subclass, we don't really know what putting
14447 it in the interned dict might do. */
14448 if (!PyUnicode_CheckExact(s))
14449 return;
14450 if (PyUnicode_CHECK_INTERNED(s))
14451 return;
14452 if (interned == NULL) {
14453 interned = PyDict_New();
14454 if (interned == NULL) {
14455 PyErr_Clear(); /* Don't leave an exception */
14456 return;
14457 }
14458 }
14459 /* It might be that the GetItem call fails even
14460 though the key is present in the dictionary,
14461 namely when this happens during a stack overflow. */
14462 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014463 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014464 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014465
Benjamin Peterson29060642009-01-31 22:14:21 +000014466 if (t) {
14467 Py_INCREF(t);
14468 Py_DECREF(*p);
14469 *p = t;
14470 return;
14471 }
Walter Dörwald16807132007-05-25 13:52:07 +000014472
Benjamin Peterson14339b62009-01-31 16:36:08 +000014473 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014474 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 PyErr_Clear();
14476 PyThreadState_GET()->recursion_critical = 0;
14477 return;
14478 }
14479 PyThreadState_GET()->recursion_critical = 0;
14480 /* The two references in interned are not counted by refcnt.
14481 The deallocator will take care of this */
14482 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014483 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014484}
14485
14486void
14487PyUnicode_InternImmortal(PyObject **p)
14488{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014489 PyUnicode_InternInPlace(p);
14490 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014491 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014492 Py_INCREF(*p);
14493 }
Walter Dörwald16807132007-05-25 13:52:07 +000014494}
14495
14496PyObject *
14497PyUnicode_InternFromString(const char *cp)
14498{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014499 PyObject *s = PyUnicode_FromString(cp);
14500 if (s == NULL)
14501 return NULL;
14502 PyUnicode_InternInPlace(&s);
14503 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014504}
14505
Alexander Belopolsky40018472011-02-26 01:02:56 +000014506void
14507_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014508{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014509 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014510 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014511 Py_ssize_t i, n;
14512 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014513
Benjamin Peterson14339b62009-01-31 16:36:08 +000014514 if (interned == NULL || !PyDict_Check(interned))
14515 return;
14516 keys = PyDict_Keys(interned);
14517 if (keys == NULL || !PyList_Check(keys)) {
14518 PyErr_Clear();
14519 return;
14520 }
Walter Dörwald16807132007-05-25 13:52:07 +000014521
Benjamin Peterson14339b62009-01-31 16:36:08 +000014522 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14523 detector, interned unicode strings are not forcibly deallocated;
14524 rather, we give them their stolen references back, and then clear
14525 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014526
Benjamin Peterson14339b62009-01-31 16:36:08 +000014527 n = PyList_GET_SIZE(keys);
14528 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014529 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014530 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014531 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014532 if (PyUnicode_READY(s) == -1) {
14533 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014534 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014536 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014537 case SSTATE_NOT_INTERNED:
14538 /* XXX Shouldn't happen */
14539 break;
14540 case SSTATE_INTERNED_IMMORTAL:
14541 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014542 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 break;
14544 case SSTATE_INTERNED_MORTAL:
14545 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014546 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014547 break;
14548 default:
14549 Py_FatalError("Inconsistent interned string state.");
14550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014551 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014552 }
14553 fprintf(stderr, "total size of all interned strings: "
14554 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14555 "mortal/immortal\n", mortal_size, immortal_size);
14556 Py_DECREF(keys);
14557 PyDict_Clear(interned);
14558 Py_DECREF(interned);
14559 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014560}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014561
14562
14563/********************* Unicode Iterator **************************/
14564
14565typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014566 PyObject_HEAD
14567 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014568 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014569} unicodeiterobject;
14570
14571static void
14572unicodeiter_dealloc(unicodeiterobject *it)
14573{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014574 _PyObject_GC_UNTRACK(it);
14575 Py_XDECREF(it->it_seq);
14576 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014577}
14578
14579static int
14580unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14581{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014582 Py_VISIT(it->it_seq);
14583 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014584}
14585
14586static PyObject *
14587unicodeiter_next(unicodeiterobject *it)
14588{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014589 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014590
Benjamin Peterson14339b62009-01-31 16:36:08 +000014591 assert(it != NULL);
14592 seq = it->it_seq;
14593 if (seq == NULL)
14594 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014595 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014597 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14598 int kind = PyUnicode_KIND(seq);
14599 void *data = PyUnicode_DATA(seq);
14600 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14601 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014602 if (item != NULL)
14603 ++it->it_index;
14604 return item;
14605 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014606
Benjamin Peterson14339b62009-01-31 16:36:08 +000014607 Py_DECREF(seq);
14608 it->it_seq = NULL;
14609 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014610}
14611
14612static PyObject *
14613unicodeiter_len(unicodeiterobject *it)
14614{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014615 Py_ssize_t len = 0;
14616 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014617 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014618 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014619}
14620
14621PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14622
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014623static PyObject *
14624unicodeiter_reduce(unicodeiterobject *it)
14625{
14626 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014627 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014628 it->it_seq, it->it_index);
14629 } else {
14630 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14631 if (u == NULL)
14632 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014633 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014634 }
14635}
14636
14637PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14638
14639static PyObject *
14640unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14641{
14642 Py_ssize_t index = PyLong_AsSsize_t(state);
14643 if (index == -1 && PyErr_Occurred())
14644 return NULL;
14645 if (index < 0)
14646 index = 0;
14647 it->it_index = index;
14648 Py_RETURN_NONE;
14649}
14650
14651PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14652
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014653static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014654 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014655 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014656 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14657 reduce_doc},
14658 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14659 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014660 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014661};
14662
14663PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014664 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14665 "str_iterator", /* tp_name */
14666 sizeof(unicodeiterobject), /* tp_basicsize */
14667 0, /* tp_itemsize */
14668 /* methods */
14669 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14670 0, /* tp_print */
14671 0, /* tp_getattr */
14672 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014673 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014674 0, /* tp_repr */
14675 0, /* tp_as_number */
14676 0, /* tp_as_sequence */
14677 0, /* tp_as_mapping */
14678 0, /* tp_hash */
14679 0, /* tp_call */
14680 0, /* tp_str */
14681 PyObject_GenericGetAttr, /* tp_getattro */
14682 0, /* tp_setattro */
14683 0, /* tp_as_buffer */
14684 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14685 0, /* tp_doc */
14686 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14687 0, /* tp_clear */
14688 0, /* tp_richcompare */
14689 0, /* tp_weaklistoffset */
14690 PyObject_SelfIter, /* tp_iter */
14691 (iternextfunc)unicodeiter_next, /* tp_iternext */
14692 unicodeiter_methods, /* tp_methods */
14693 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014694};
14695
14696static PyObject *
14697unicode_iter(PyObject *seq)
14698{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014699 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014700
Benjamin Peterson14339b62009-01-31 16:36:08 +000014701 if (!PyUnicode_Check(seq)) {
14702 PyErr_BadInternalCall();
14703 return NULL;
14704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014705 if (PyUnicode_READY(seq) == -1)
14706 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014707 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14708 if (it == NULL)
14709 return NULL;
14710 it->it_index = 0;
14711 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014712 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014713 _PyObject_GC_TRACK(it);
14714 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014715}
14716
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014717
14718size_t
14719Py_UNICODE_strlen(const Py_UNICODE *u)
14720{
14721 int res = 0;
14722 while(*u++)
14723 res++;
14724 return res;
14725}
14726
14727Py_UNICODE*
14728Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14729{
14730 Py_UNICODE *u = s1;
14731 while ((*u++ = *s2++));
14732 return s1;
14733}
14734
14735Py_UNICODE*
14736Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14737{
14738 Py_UNICODE *u = s1;
14739 while ((*u++ = *s2++))
14740 if (n-- == 0)
14741 break;
14742 return s1;
14743}
14744
14745Py_UNICODE*
14746Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14747{
14748 Py_UNICODE *u1 = s1;
14749 u1 += Py_UNICODE_strlen(u1);
14750 Py_UNICODE_strcpy(u1, s2);
14751 return s1;
14752}
14753
14754int
14755Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14756{
14757 while (*s1 && *s2 && *s1 == *s2)
14758 s1++, s2++;
14759 if (*s1 && *s2)
14760 return (*s1 < *s2) ? -1 : +1;
14761 if (*s1)
14762 return 1;
14763 if (*s2)
14764 return -1;
14765 return 0;
14766}
14767
14768int
14769Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14770{
14771 register Py_UNICODE u1, u2;
14772 for (; n != 0; n--) {
14773 u1 = *s1;
14774 u2 = *s2;
14775 if (u1 != u2)
14776 return (u1 < u2) ? -1 : +1;
14777 if (u1 == '\0')
14778 return 0;
14779 s1++;
14780 s2++;
14781 }
14782 return 0;
14783}
14784
14785Py_UNICODE*
14786Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14787{
14788 const Py_UNICODE *p;
14789 for (p = s; *p; p++)
14790 if (*p == c)
14791 return (Py_UNICODE*)p;
14792 return NULL;
14793}
14794
14795Py_UNICODE*
14796Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14797{
14798 const Py_UNICODE *p;
14799 p = s + Py_UNICODE_strlen(s);
14800 while (p != s) {
14801 p--;
14802 if (*p == c)
14803 return (Py_UNICODE*)p;
14804 }
14805 return NULL;
14806}
Victor Stinner331ea922010-08-10 16:37:20 +000014807
Victor Stinner71133ff2010-09-01 23:43:53 +000014808Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014809PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014810{
Victor Stinner577db2c2011-10-11 22:12:48 +020014811 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014812 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014814 if (!PyUnicode_Check(unicode)) {
14815 PyErr_BadArgument();
14816 return NULL;
14817 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014818 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014819 if (u == NULL)
14820 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014821 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014822 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014823 PyErr_NoMemory();
14824 return NULL;
14825 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014826 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014827 size *= sizeof(Py_UNICODE);
14828 copy = PyMem_Malloc(size);
14829 if (copy == NULL) {
14830 PyErr_NoMemory();
14831 return NULL;
14832 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014833 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014834 return copy;
14835}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014836
Georg Brandl66c221e2010-10-14 07:04:07 +000014837/* A _string module, to export formatter_parser and formatter_field_name_split
14838 to the string.Formatter class implemented in Python. */
14839
14840static PyMethodDef _string_methods[] = {
14841 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14842 METH_O, PyDoc_STR("split the argument as a field name")},
14843 {"formatter_parser", (PyCFunction) formatter_parser,
14844 METH_O, PyDoc_STR("parse the argument as a format string")},
14845 {NULL, NULL}
14846};
14847
14848static struct PyModuleDef _string_module = {
14849 PyModuleDef_HEAD_INIT,
14850 "_string",
14851 PyDoc_STR("string helper module"),
14852 0,
14853 _string_methods,
14854 NULL,
14855 NULL,
14856 NULL,
14857 NULL
14858};
14859
14860PyMODINIT_FUNC
14861PyInit__string(void)
14862{
14863 return PyModule_Create(&_string_module);
14864}
14865
14866
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014867#ifdef __cplusplus
14868}
14869#endif