blob: 0b9d65291f70245b25d8fded825609620a478401 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
60 The globals are initialized by the _PyUnicode_Init() API and should
61 not be used before calling that API.
62
63*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000064
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065
66#ifdef __cplusplus
67extern "C" {
68#endif
69
Victor Stinner8faf8212011-12-08 22:14:11 +010070/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
71#define MAX_UNICODE 0x10ffff
72
Victor Stinner910337b2011-10-03 03:20:16 +020073#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020074# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020075#else
76# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
77#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020078
/* Field accessors for the unicode object structures.

   The names starting with an underscore expand to a plain struct member
   (usable as an lvalue) and, unless noted, perform no checking.  The
   PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() variants assert that the
   object passes _PyUnicode_CHECK() and is ready before reading it. */

/* The utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 data of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise it is
   the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* The utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for a compact ASCII string this is the
   length member of the PyASCIIObject, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* The wstr member of a PyASCIIObject. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* The wstr_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* The length, state and hash members of a PyASCIIObject. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* The data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200113
/* Return the larger of two maximum-character values; use it when you are
   computing the second argument of PyUnicode_New().

   A bitwise OR of the two values is not a maximum: 0x10000 | 0xFFFF is
   0x1FFFF, which is larger than MAX_UNICODE (0x10ffff).  A real comparison
   always yields one of the two inputs.

   Each argument may be evaluated twice, so pass expressions without side
   effects. */
#define MAX_MAXCHAR(maxchar1, maxchar2) \
    ((maxchar1) > (maxchar2) ? (maxchar1) : (maxchar2))

118
/* File-local replacement for PyUnicode_READY(): the previous definition is
   removed and this one asserts _PyUnicode_CHECK(op) first.  It evaluates
   to 0 when the object is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready(op)))

Victor Stinner910337b2011-10-03 03:20:16 +0200125
/* true if the utf8 pointer of the object is the same pointer as its
   character data (PyUnicode_DATA); asserts that the object is not a
   compact ASCII string */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the object is the same pointer as its
   character data (PyUnicode_DATA).

   The macro must only refer to its own parameter: it previously named a
   variable called "unicode", which made it depend on the caller's local
   variable name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

133
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not a compact ASCII string,
   its utf8 pointer is non-NULL, and that pointer differs from the
   character data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either
   the object is not ready or wstr differs from the character data
   pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

149
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The copy is
   done four characters at a time, followed by a loop for the remaining
   0-3 characters.

   The "to" argument is parenthesized before being cast so that an
   expression such as "buffer + offset" is cast as a whole rather than
   having the cast bind to "buffer" only. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

Victor Stinner829c0ad2011-10-03 01:08:02 +0200173
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
182static PyObject *interned;
183
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000184/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200185static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000186
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200187/* List of static strings. */
188static _Py_Identifier *static_strings;
189
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190/* Single character Unicode strings in the Latin-1 range are being
191 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200192static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193
/* Lookup table for the ASCII range (128 entries, indexed by code point):
   an entry is 1 if the character is treated as whitespace, 0 otherwise.
   Each row below covers eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0
};
224
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200225/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200226static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200227static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100228static int unicode_modifiable(PyObject *unicode);
229
Victor Stinnerfe226c02011-10-03 03:52:20 +0200230
Alexander Belopolsky40018472011-02-26 01:02:56 +0000231static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200232_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
233static PyObject *
234_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
235static PyObject *
236_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
237
238static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000239unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000240 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100241 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000242 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
243
Alexander Belopolsky40018472011-02-26 01:02:56 +0000244static void
245raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300246 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100247 PyObject *unicode,
248 Py_ssize_t startpos, Py_ssize_t endpos,
249 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000250
/* Fast detection of line break characters in the ASCII range (128
   entries, indexed by code point): an entry is 1 if the character is a
   line break, 0 otherwise.  The table is only ever read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
278
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300279/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
280 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000281Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000282PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000283{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000284#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000285 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000287 /* This is actually an illegal character, so it should
288 not be passed to unichr. */
289 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000290#endif
291}
292
#ifdef Py_DEBUG
/* Debug-build helper: assert that the internal fields of a unicode object
   agree with each other.

   op            -- object to check; must pass PyUnicode_Check().
   check_content -- if non-zero (and the string is not in the wchar_t-only
                    form), also scan the characters to verify that the
                    storage kind matches the largest character and that the
                    data is followed by a null character.

   Every check is an assert(); the function returns 1 when it returns at
   all, so that it can itself be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string: one byte per character, always ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the character data starts right after
               the PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: the character data is reached through the
               data.any pointer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr buffer is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data is shared with the utf8 pointer */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is expected to point at the character data exactly
               when the storage kind has the same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* a missing buffer must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
408
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100409static PyObject*
410unicode_result_wchar(PyObject *unicode)
411{
412#ifndef Py_DEBUG
413 Py_ssize_t len;
414
415 assert(Py_REFCNT(unicode) == 1);
416
417 len = _PyUnicode_WSTR_LENGTH(unicode);
418 if (len == 0) {
419 Py_INCREF(unicode_empty);
420 Py_DECREF(unicode);
421 return unicode_empty;
422 }
423
424 if (len == 1) {
425 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
426 if (ch < 256) {
427 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
428 Py_DECREF(unicode);
429 return latin1_char;
430 }
431 }
432
433 if (_PyUnicode_Ready(unicode) < 0) {
434 Py_XDECREF(unicode);
435 return NULL;
436 }
437#else
438 /* don't make the result ready in debug mode to ensure that the caller
439 makes the string ready before using it */
440 assert(_PyUnicode_CheckConsistency(unicode, 1));
441#endif
442 return unicode;
443}
444
445static PyObject*
446unicode_result_ready(PyObject *unicode)
447{
448 Py_ssize_t length;
449
450 length = PyUnicode_GET_LENGTH(unicode);
451 if (length == 0) {
452 if (unicode != unicode_empty) {
453 Py_INCREF(unicode_empty);
454 Py_DECREF(unicode);
455 }
456 return unicode_empty;
457 }
458
459 if (length == 1) {
460 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
461 if (ch < 256) {
462 PyObject *latin1_char = unicode_latin1[ch];
463 if (latin1_char != NULL) {
464 if (unicode != latin1_char) {
465 Py_INCREF(latin1_char);
466 Py_DECREF(unicode);
467 }
468 return latin1_char;
469 }
470 else {
471 assert(_PyUnicode_CheckConsistency(unicode, 1));
472 Py_INCREF(unicode);
473 unicode_latin1[ch] = unicode;
474 return unicode;
475 }
476 }
477 }
478
479 assert(_PyUnicode_CheckConsistency(unicode, 1));
480 return unicode;
481}
482
483static PyObject*
484unicode_result(PyObject *unicode)
485{
486 assert(_PyUnicode_CHECK(unicode));
487 if (PyUnicode_IS_READY(unicode))
488 return unicode_result_ready(unicode);
489 else
490 return unicode_result_wchar(unicode);
491}
492
Victor Stinnerc4b49542011-12-11 22:44:26 +0100493static PyObject*
494unicode_result_unchanged(PyObject *unicode)
495{
496 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500497 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100498 return NULL;
499 Py_INCREF(unicode);
500 return unicode;
501 }
502 else
503 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100504 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100505}
506
Victor Stinner3a50e702011-10-18 21:21:00 +0200507#ifdef HAVE_MBCS
508static OSVERSIONINFOEX winver;
509#endif
510
Thomas Wouters477c8d52006-05-27 19:21:47 +0000511/* --- Bloom Filters ----------------------------------------------------- */
512
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
516
517/* the linebreak mask is set up by Unicode_Init below */
518
Antoine Pitrouf068f942010-01-13 14:19:12 +0000519#if LONG_BIT >= 128
520#define BLOOM_WIDTH 128
521#elif LONG_BIT >= 64
522#define BLOOM_WIDTH 64
523#elif LONG_BIT >= 32
524#define BLOOM_WIDTH 32
525#else
526#error "LONG_BIT is smaller than 32"
527#endif
528
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529#define BLOOM_MASK unsigned long
530
531static BLOOM_MASK bloom_linebreak;
532
/* BLOOM_ADD(mask, ch): set the bit selected by ch in mask (mask must be a
   modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit selected by ch is set in mask.

   The bit index is ch & (BLOOM_WIDTH - 1).  The mask argument is
   parenthesized so that an expression such as "a | b" is tested as a
   whole instead of being regrouped by operator precedence. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000535
/* Non-zero if ch is a line break.  Characters below 128 are looked up
   directly in the ascii_linebreak table; for other characters the
   bloom_linebreak mask is tested first and Py_UNICODE_ISLINEBREAK() is
   only called when the mask test succeeds.  ch is evaluated more than
   once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000539
Alexander Belopolsky40018472011-02-26 01:02:56 +0000540Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542{
543 /* calculate simple bloom-style bitmask for a given unicode string */
544
Antoine Pitrouf068f942010-01-13 14:19:12 +0000545 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000546 Py_ssize_t i;
547
548 mask = 0;
549 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
552 return mask;
553}
554
/* Non-zero if chr occurs in the unicode string str.  The mask is tested
   first; PyUnicode_FindChar() is only called over the whole string when
   the mask test succeeds.  chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)            \
    (BLOOM(mask, chr)                           \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200559/* Compilation of templated routines */
560
561#include "stringlib/asciilib.h"
562#include "stringlib/fastsearch.h"
563#include "stringlib/partition.h"
564#include "stringlib/split.h"
565#include "stringlib/count.h"
566#include "stringlib/find.h"
567#include "stringlib/find_max_char.h"
568#include "stringlib/localeutil.h"
569#include "stringlib/undef.h"
570
571#include "stringlib/ucs1lib.h"
572#include "stringlib/fastsearch.h"
573#include "stringlib/partition.h"
574#include "stringlib/split.h"
575#include "stringlib/count.h"
576#include "stringlib/find.h"
577#include "stringlib/find_max_char.h"
578#include "stringlib/localeutil.h"
579#include "stringlib/undef.h"
580
581#include "stringlib/ucs2lib.h"
582#include "stringlib/fastsearch.h"
583#include "stringlib/partition.h"
584#include "stringlib/split.h"
585#include "stringlib/count.h"
586#include "stringlib/find.h"
587#include "stringlib/find_max_char.h"
588#include "stringlib/localeutil.h"
589#include "stringlib/undef.h"
590
591#include "stringlib/ucs4lib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200601#include "stringlib/unicodedefs.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/count.h"
604#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100605#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200606
Guido van Rossumd57fd912000-03-10 22:53:23 +0000607/* --- Unicode Object ----------------------------------------------------- */
608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200609static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200610fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200611
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200612Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
613 Py_ssize_t size, Py_UCS4 ch,
614 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200615{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200616 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
617
618 switch (kind) {
619 case PyUnicode_1BYTE_KIND:
620 {
621 Py_UCS1 ch1 = (Py_UCS1) ch;
622 if (ch1 == ch)
623 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
624 else
625 return -1;
626 }
627 case PyUnicode_2BYTE_KIND:
628 {
629 Py_UCS2 ch2 = (Py_UCS2) ch;
630 if (ch2 == ch)
631 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
632 else
633 return -1;
634 }
635 case PyUnicode_4BYTE_KIND:
636 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
637 default:
638 assert(0);
639 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200640 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200641}
642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643static PyObject*
644resize_compact(PyObject *unicode, Py_ssize_t length)
645{
646 Py_ssize_t char_size;
647 Py_ssize_t struct_size;
648 Py_ssize_t new_size;
649 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100650 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200651 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200652 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100653 assert(PyUnicode_IS_COMPACT(unicode));
654
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200655 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100656 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 struct_size = sizeof(PyASCIIObject);
658 else
659 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200660 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
663 PyErr_NoMemory();
664 return NULL;
665 }
666 new_size = (struct_size + (length + 1) * char_size);
667
Victor Stinner84def372011-12-11 20:04:56 +0100668 _Py_DEC_REFTOTAL;
669 _Py_ForgetReference(unicode);
670
671 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
672 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100673 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200674 PyErr_NoMemory();
675 return NULL;
676 }
Victor Stinner84def372011-12-11 20:04:56 +0100677 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200681 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200682 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100683 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200684 _PyUnicode_WSTR_LENGTH(unicode) = length;
685 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
687 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200688 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 return unicode;
690}
691
Alexander Belopolsky40018472011-02-26 01:02:56 +0000692static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200693resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694{
Victor Stinner95663112011-10-04 01:03:50 +0200695 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100696 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000699
Victor Stinnerfe226c02011-10-03 03:52:20 +0200700 if (PyUnicode_IS_READY(unicode)) {
701 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200702 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200703 void *data;
704
705 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200706 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200707 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
708 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709
710 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
711 PyErr_NoMemory();
712 return -1;
713 }
714 new_size = (length + 1) * char_size;
715
Victor Stinner7a9105a2011-12-12 00:13:42 +0100716 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
717 {
718 PyObject_DEL(_PyUnicode_UTF8(unicode));
719 _PyUnicode_UTF8(unicode) = NULL;
720 _PyUnicode_UTF8_LENGTH(unicode) = 0;
721 }
722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 data = (PyObject *)PyObject_REALLOC(data, new_size);
724 if (data == NULL) {
725 PyErr_NoMemory();
726 return -1;
727 }
728 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200729 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200730 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200731 _PyUnicode_WSTR_LENGTH(unicode) = length;
732 }
733 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200734 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200735 _PyUnicode_UTF8_LENGTH(unicode) = length;
736 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
738 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200739 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200740 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200743 }
Victor Stinner95663112011-10-04 01:03:50 +0200744 assert(_PyUnicode_WSTR(unicode) != NULL);
745
746 /* check for integer overflow */
747 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
748 PyErr_NoMemory();
749 return -1;
750 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100751 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200752 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100753 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200754 if (!wstr) {
755 PyErr_NoMemory();
756 return -1;
757 }
758 _PyUnicode_WSTR(unicode) = wstr;
759 _PyUnicode_WSTR(unicode)[length] = 0;
760 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200761 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000762 return 0;
763}
764
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765static PyObject*
766resize_copy(PyObject *unicode, Py_ssize_t length)
767{
768 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100769 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200770 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100771
Benjamin Petersonbac79492012-01-14 13:34:47 -0500772 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100773 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200774
775 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
776 if (copy == NULL)
777 return NULL;
778
779 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200780 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200782 }
783 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200784 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200786 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200787 if (w == NULL)
788 return NULL;
789 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
790 copy_length = Py_MIN(copy_length, length);
791 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
792 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200793 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200794 }
795}
796
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000798 Ux0000 terminated; some code (e.g. new_identifier)
799 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000800
801 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000802 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000803
804*/
805
Alexander Belopolsky40018472011-02-26 01:02:56 +0000806static PyUnicodeObject *
807_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808{
809 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811
Thomas Wouters477c8d52006-05-27 19:21:47 +0000812 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 if (length == 0 && unicode_empty != NULL) {
814 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200815 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816 }
817
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000818 /* Ensure we won't overflow the size. */
819 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
820 return (PyUnicodeObject *)PyErr_NoMemory();
821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200822 if (length < 0) {
823 PyErr_SetString(PyExc_SystemError,
824 "Negative size passed to _PyUnicode_New");
825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826 }
827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200828 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
829 if (unicode == NULL)
830 return NULL;
831 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
832 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
833 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100834 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000835 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100836 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838
Jeremy Hyltond8082792003-09-16 19:41:39 +0000839 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000840 * the caller fails before initializing str -- unicode_resize()
841 * reads str[0], and the Keep-Alive optimization can keep memory
842 * allocated for str alive across a call to unicode_dealloc(unicode).
843 * We don't want unicode_resize to read uninitialized memory in
844 * that case.
845 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200846 _PyUnicode_WSTR(unicode)[0] = 0;
847 _PyUnicode_WSTR(unicode)[length] = 0;
848 _PyUnicode_WSTR_LENGTH(unicode) = length;
849 _PyUnicode_HASH(unicode) = -1;
850 _PyUnicode_STATE(unicode).interned = 0;
851 _PyUnicode_STATE(unicode).kind = 0;
852 _PyUnicode_STATE(unicode).compact = 0;
853 _PyUnicode_STATE(unicode).ready = 0;
854 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200855 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200856 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200857 _PyUnicode_UTF8(unicode) = NULL;
858 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100859 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000860 return unicode;
861}
862
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863static const char*
864unicode_kind_name(PyObject *unicode)
865{
Victor Stinner42dfd712011-10-03 14:41:45 +0200866 /* don't check consistency: unicode_kind_name() is called from
867 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200868 if (!PyUnicode_IS_COMPACT(unicode))
869 {
870 if (!PyUnicode_IS_READY(unicode))
871 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600872 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200873 {
874 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200875 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200876 return "legacy ascii";
877 else
878 return "legacy latin1";
879 case PyUnicode_2BYTE_KIND:
880 return "legacy UCS2";
881 case PyUnicode_4BYTE_KIND:
882 return "legacy UCS4";
883 default:
884 return "<legacy invalid kind>";
885 }
886 }
887 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600888 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200889 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200890 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 return "ascii";
892 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200895 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200897 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200898 default:
899 return "<invalid compact kind>";
900 }
901}
902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}
908
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}
912void *_PyUnicode_data(void *unicode){
913 printf("obj %p\n", unicode);
914 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
915 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
916 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
917 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
918 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
919 return PyUnicode_DATA(unicode);
920}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200921
922void
923_PyUnicode_Dump(PyObject *op)
924{
925 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200926 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
927 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
928 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200929
Victor Stinnera849a4b2011-10-03 12:12:11 +0200930 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200931 {
932 if (ascii->state.ascii)
933 data = (ascii + 1);
934 else
935 data = (compact + 1);
936 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200937 else
938 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200939 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
940
Victor Stinnera849a4b2011-10-03 12:12:11 +0200941 if (ascii->wstr == data)
942 printf("shared ");
943 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200944
Victor Stinnera3b334d2011-10-03 13:53:37 +0200945 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200946 printf(" (%zu), ", compact->wstr_length);
947 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
948 printf("shared ");
949 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200950 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200951 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200952}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200953#endif
954
955PyObject *
956PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
957{
958 PyObject *obj;
959 PyCompactUnicodeObject *unicode;
960 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200961 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200962 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963 Py_ssize_t char_size;
964 Py_ssize_t struct_size;
965
966 /* Optimization for empty strings */
967 if (size == 0 && unicode_empty != NULL) {
968 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200969 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 }
971
Victor Stinner9e9d6892011-10-04 01:02:02 +0200972 is_ascii = 0;
973 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 struct_size = sizeof(PyCompactUnicodeObject);
975 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200976 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977 char_size = 1;
978 is_ascii = 1;
979 struct_size = sizeof(PyASCIIObject);
980 }
981 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +0200982 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200983 char_size = 1;
984 }
985 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +0200986 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200987 char_size = 2;
988 if (sizeof(wchar_t) == 2)
989 is_sharing = 1;
990 }
991 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +0100992 if (maxchar > MAX_UNICODE) {
993 PyErr_SetString(PyExc_SystemError,
994 "invalid maximum character passed to PyUnicode_New");
995 return NULL;
996 }
Victor Stinner8f825062012-04-27 13:55:39 +0200997 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200998 char_size = 4;
999 if (sizeof(wchar_t) == 4)
1000 is_sharing = 1;
1001 }
1002
1003 /* Ensure we won't overflow the size. */
1004 if (size < 0) {
1005 PyErr_SetString(PyExc_SystemError,
1006 "Negative size passed to PyUnicode_New");
1007 return NULL;
1008 }
1009 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1010 return PyErr_NoMemory();
1011
1012 /* Duplicated allocation code from _PyObject_New() instead of a call to
1013 * PyObject_New() so we are able to allocate space for the object and
1014 * it's data buffer.
1015 */
1016 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1017 if (obj == NULL)
1018 return PyErr_NoMemory();
1019 obj = PyObject_INIT(obj, &PyUnicode_Type);
1020 if (obj == NULL)
1021 return NULL;
1022
1023 unicode = (PyCompactUnicodeObject *)obj;
1024 if (is_ascii)
1025 data = ((PyASCIIObject*)obj) + 1;
1026 else
1027 data = unicode + 1;
1028 _PyUnicode_LENGTH(unicode) = size;
1029 _PyUnicode_HASH(unicode) = -1;
1030 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001031 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001032 _PyUnicode_STATE(unicode).compact = 1;
1033 _PyUnicode_STATE(unicode).ready = 1;
1034 _PyUnicode_STATE(unicode).ascii = is_ascii;
1035 if (is_ascii) {
1036 ((char*)data)[size] = 0;
1037 _PyUnicode_WSTR(unicode) = NULL;
1038 }
Victor Stinner8f825062012-04-27 13:55:39 +02001039 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 ((char*)data)[size] = 0;
1041 _PyUnicode_WSTR(unicode) = NULL;
1042 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001044 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001045 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 else {
1047 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001048 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001049 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001051 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 ((Py_UCS4*)data)[size] = 0;
1053 if (is_sharing) {
1054 _PyUnicode_WSTR_LENGTH(unicode) = size;
1055 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1056 }
1057 else {
1058 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1059 _PyUnicode_WSTR(unicode) = NULL;
1060 }
1061 }
Victor Stinner8f825062012-04-27 13:55:39 +02001062#ifdef Py_DEBUG
1063 /* Fill the data with invalid characters to detect bugs earlier.
1064 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1065 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1066 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1067 memset(data, 0xff, size * kind);
1068#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001069 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001070 return obj;
1071}
1072
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`, combining each valid high/low surrogate pair into a single code
   point.  Unpaired surrogates are copied through unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Two code units collapse into one code point. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src;
        src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1112
Victor Stinnercd9950f2011-10-02 00:34:53 +02001113static int
Victor Stinner488fa492011-12-12 00:01:39 +01001114unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001115{
Victor Stinner488fa492011-12-12 00:01:39 +01001116 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001117 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001118 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001119 return -1;
1120 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001121 return 0;
1122}
1123
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001124static int
1125_copy_characters(PyObject *to, Py_ssize_t to_start,
1126 PyObject *from, Py_ssize_t from_start,
1127 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001128{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001129 unsigned int from_kind, to_kind;
1130 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001131
Victor Stinneree4544c2012-05-09 22:24:08 +02001132 assert(0 <= how_many);
1133 assert(0 <= from_start);
1134 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001136 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001137 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001138
Victor Stinnerd3f08822012-05-29 12:57:52 +02001139 assert(PyUnicode_Check(to));
1140 assert(PyUnicode_IS_READY(to));
1141 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1142
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001143 if (how_many == 0)
1144 return 0;
1145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001147 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001148 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001149 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinnerf1852262012-06-16 16:38:26 +02001151#ifdef Py_DEBUG
1152 if (!check_maxchar
1153 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1154 {
1155 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1156 Py_UCS4 ch;
1157 Py_ssize_t i;
1158 for (i=0; i < how_many; i++) {
1159 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1160 assert(ch <= to_maxchar);
1161 }
1162 }
1163#endif
1164
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001165 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001166 if (check_maxchar
1167 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1168 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001169 /* Writing Latin-1 characters into an ASCII string requires to
1170 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001171 Py_UCS4 max_char;
1172 max_char = ucs1lib_find_max_char(from_data,
1173 (Py_UCS1*)from_data + how_many);
1174 if (max_char >= 128)
1175 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001176 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001177 Py_MEMCPY((char*)to_data + to_kind * to_start,
1178 (char*)from_data + from_kind * from_start,
1179 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001181 else if (from_kind == PyUnicode_1BYTE_KIND
1182 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001183 {
1184 _PyUnicode_CONVERT_BYTES(
1185 Py_UCS1, Py_UCS2,
1186 PyUnicode_1BYTE_DATA(from) + from_start,
1187 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1188 PyUnicode_2BYTE_DATA(to) + to_start
1189 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001190 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001191 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001192 && to_kind == PyUnicode_4BYTE_KIND)
1193 {
1194 _PyUnicode_CONVERT_BYTES(
1195 Py_UCS1, Py_UCS4,
1196 PyUnicode_1BYTE_DATA(from) + from_start,
1197 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1198 PyUnicode_4BYTE_DATA(to) + to_start
1199 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001200 }
1201 else if (from_kind == PyUnicode_2BYTE_KIND
1202 && to_kind == PyUnicode_4BYTE_KIND)
1203 {
1204 _PyUnicode_CONVERT_BYTES(
1205 Py_UCS2, Py_UCS4,
1206 PyUnicode_2BYTE_DATA(from) + from_start,
1207 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1208 PyUnicode_4BYTE_DATA(to) + to_start
1209 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001210 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001212 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (!check_maxchar) {
1215 if (from_kind == PyUnicode_2BYTE_KIND
1216 && to_kind == PyUnicode_1BYTE_KIND)
1217 {
1218 _PyUnicode_CONVERT_BYTES(
1219 Py_UCS2, Py_UCS1,
1220 PyUnicode_2BYTE_DATA(from) + from_start,
1221 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1222 PyUnicode_1BYTE_DATA(to) + to_start
1223 );
1224 }
1225 else if (from_kind == PyUnicode_4BYTE_KIND
1226 && to_kind == PyUnicode_1BYTE_KIND)
1227 {
1228 _PyUnicode_CONVERT_BYTES(
1229 Py_UCS4, Py_UCS1,
1230 PyUnicode_4BYTE_DATA(from) + from_start,
1231 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1232 PyUnicode_1BYTE_DATA(to) + to_start
1233 );
1234 }
1235 else if (from_kind == PyUnicode_4BYTE_KIND
1236 && to_kind == PyUnicode_2BYTE_KIND)
1237 {
1238 _PyUnicode_CONVERT_BYTES(
1239 Py_UCS4, Py_UCS2,
1240 PyUnicode_4BYTE_DATA(from) + from_start,
1241 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1242 PyUnicode_2BYTE_DATA(to) + to_start
1243 );
1244 }
1245 else {
1246 assert(0);
1247 return -1;
1248 }
1249 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001250 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001251 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001252 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001253 Py_ssize_t i;
1254
Victor Stinnera0702ab2011-09-29 14:14:38 +02001255 for (i=0; i < how_many; i++) {
1256 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001257 if (ch > to_maxchar)
1258 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001259 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1260 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001261 }
1262 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001263 return 0;
1264}
1265
Victor Stinnerd3f08822012-05-29 12:57:52 +02001266void
1267_PyUnicode_FastCopyCharacters(
1268 PyObject *to, Py_ssize_t to_start,
1269 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001270{
1271 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1272}
1273
1274Py_ssize_t
1275PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1276 PyObject *from, Py_ssize_t from_start,
1277 Py_ssize_t how_many)
1278{
1279 int err;
1280
1281 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1282 PyErr_BadInternalCall();
1283 return -1;
1284 }
1285
Benjamin Petersonbac79492012-01-14 13:34:47 -05001286 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001287 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001288 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289 return -1;
1290
Victor Stinnerd3f08822012-05-29 12:57:52 +02001291 if (from_start < 0) {
1292 PyErr_SetString(PyExc_IndexError, "string index out of range");
1293 return -1;
1294 }
1295 if (to_start < 0) {
1296 PyErr_SetString(PyExc_IndexError, "string index out of range");
1297 return -1;
1298 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001299 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1300 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1301 PyErr_Format(PyExc_SystemError,
1302 "Cannot write %zi characters at %zi "
1303 "in a string of %zi characters",
1304 how_many, to_start, PyUnicode_GET_LENGTH(to));
1305 return -1;
1306 }
1307
1308 if (how_many == 0)
1309 return 0;
1310
Victor Stinner488fa492011-12-12 00:01:39 +01001311 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001312 return -1;
1313
1314 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1315 if (err) {
1316 PyErr_Format(PyExc_SystemError,
1317 "Cannot copy %s characters "
1318 "into a string of %s characters",
1319 unicode_kind_name(from),
1320 unicode_kind_name(to));
1321 return -1;
1322 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001323 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324}
1325
Victor Stinner17222162011-09-28 22:15:37 +02001326/* Find the maximum code point and count the number of surrogate pairs so a
1327 correct string length can be computed before converting a string to UCS4.
1328 This function counts single surrogates as a character and not as a pair.
1329
1330 Return 0 on success, or -1 on error. */
1331static int
1332find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1333 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334{
1335 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001336 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337
Victor Stinnerc53be962011-10-02 21:33:54 +02001338 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001339 *num_surrogates = 0;
1340 *maxchar = 0;
1341
1342 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001344 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1345 && (iter+1) < end
1346 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001348 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001349 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 iter += 2;
1351 }
1352 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001354 {
1355 ch = *iter;
1356 iter++;
1357 }
1358 if (ch > *maxchar) {
1359 *maxchar = ch;
1360 if (*maxchar > MAX_UNICODE) {
1361 PyErr_Format(PyExc_ValueError,
1362 "character U+%x is not in range [U+0000; U+10ffff]",
1363 ch);
1364 return -1;
1365 }
1366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001367 }
1368 return 0;
1369}
1370
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001371int
1372_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373{
1374 wchar_t *end;
1375 Py_UCS4 maxchar = 0;
1376 Py_ssize_t num_surrogates;
1377#if SIZEOF_WCHAR_T == 2
1378 Py_ssize_t length_wo_surrogates;
1379#endif
1380
Georg Brandl7597add2011-10-05 16:36:47 +02001381 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001382 strings were created using _PyObject_New() and where no canonical
1383 representation (the str field) has been set yet aka strings
1384 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001385 assert(_PyUnicode_CHECK(unicode));
1386 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001388 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001389 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001390 /* Actually, it should neither be interned nor be anything else: */
1391 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001394 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001395 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397
1398 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001399 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1400 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 PyErr_NoMemory();
1402 return -1;
1403 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001404 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 _PyUnicode_WSTR(unicode), end,
1406 PyUnicode_1BYTE_DATA(unicode));
1407 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1408 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1409 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1410 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001411 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001412 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001413 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 }
1415 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001416 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 }
1420 PyObject_FREE(_PyUnicode_WSTR(unicode));
1421 _PyUnicode_WSTR(unicode) = NULL;
1422 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1423 }
1424 /* In this case we might have to convert down from 4-byte native
1425 wchar_t to 2-byte unicode. */
1426 else if (maxchar < 65536) {
1427 assert(num_surrogates == 0 &&
1428 "FindMaxCharAndNumSurrogatePairs() messed up");
1429
Victor Stinner506f5922011-09-28 22:34:18 +02001430#if SIZEOF_WCHAR_T == 2
1431 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001432 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001433 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1434 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1435 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8(unicode) = NULL;
1437 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001438#else
1439 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001441 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001442 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001443 PyErr_NoMemory();
1444 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 }
Victor Stinner506f5922011-09-28 22:34:18 +02001446 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1447 _PyUnicode_WSTR(unicode), end,
1448 PyUnicode_2BYTE_DATA(unicode));
1449 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1450 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1451 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001452 _PyUnicode_UTF8(unicode) = NULL;
1453 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001454 PyObject_FREE(_PyUnicode_WSTR(unicode));
1455 _PyUnicode_WSTR(unicode) = NULL;
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 }
1459 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1460 else {
1461#if SIZEOF_WCHAR_T == 2
1462 /* in case the native representation is 2-bytes, we need to allocate a
1463 new normalized 4-byte version. */
1464 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1466 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 PyErr_NoMemory();
1468 return -1;
1469 }
1470 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1471 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001472 _PyUnicode_UTF8(unicode) = NULL;
1473 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001474 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1475 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001476 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 PyObject_FREE(_PyUnicode_WSTR(unicode));
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#else
1481 assert(num_surrogates == 0);
1482
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001485 _PyUnicode_UTF8(unicode) = NULL;
1486 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1488#endif
1489 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1490 }
1491 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001492 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 return 0;
1494}
1495
Alexander Belopolsky40018472011-02-26 01:02:56 +00001496static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001497unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498{
Walter Dörwald16807132007-05-25 13:52:07 +00001499 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001500 case SSTATE_NOT_INTERNED:
1501 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001502
Benjamin Peterson29060642009-01-31 22:14:21 +00001503 case SSTATE_INTERNED_MORTAL:
1504 /* revive dead object temporarily for DelItem */
1505 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001506 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001507 Py_FatalError(
1508 "deletion of interned string failed");
1509 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001510
Benjamin Peterson29060642009-01-31 22:14:21 +00001511 case SSTATE_INTERNED_IMMORTAL:
1512 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001513
Benjamin Peterson29060642009-01-31 22:14:21 +00001514 default:
1515 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001516 }
1517
Victor Stinner03490912011-10-03 23:45:12 +02001518 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001520 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001521 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001522 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1523 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001524
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001525 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526}
1527
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is one of the shared singletons (the
   empty string or a cached one-character Latin-1 string), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;
    /* Only a one-character string with a canonical kind can be a cached
       Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1544
Alexander Belopolsky40018472011-02-26 01:02:56 +00001545static int
Victor Stinner488fa492011-12-12 00:01:39 +01001546unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001547{
Victor Stinner488fa492011-12-12 00:01:39 +01001548 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 if (Py_REFCNT(unicode) != 1)
1550 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001551 if (_PyUnicode_HASH(unicode) != -1)
1552 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 if (PyUnicode_CHECK_INTERNED(unicode))
1554 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001555 if (!PyUnicode_CheckExact(unicode))
1556 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001557#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 /* singleton refcount is greater than 1 */
1559 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001560#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001561 return 1;
1562}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001563
Victor Stinnerfe226c02011-10-03 03:52:20 +02001564static int
1565unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1566{
1567 PyObject *unicode;
1568 Py_ssize_t old_length;
1569
1570 assert(p_unicode != NULL);
1571 unicode = *p_unicode;
1572
1573 assert(unicode != NULL);
1574 assert(PyUnicode_Check(unicode));
1575 assert(0 <= length);
1576
Victor Stinner910337b2011-10-03 03:20:16 +02001577 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001578 old_length = PyUnicode_WSTR_LENGTH(unicode);
1579 else
1580 old_length = PyUnicode_GET_LENGTH(unicode);
1581 if (old_length == length)
1582 return 0;
1583
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001584 if (length == 0) {
1585 Py_DECREF(*p_unicode);
1586 *p_unicode = unicode_empty;
1587 Py_INCREF(*p_unicode);
1588 return 0;
1589 }
1590
Victor Stinner488fa492011-12-12 00:01:39 +01001591 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001592 PyObject *copy = resize_copy(unicode, length);
1593 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001594 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 Py_DECREF(*p_unicode);
1596 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001597 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598 }
1599
Victor Stinnerfe226c02011-10-03 03:52:20 +02001600 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001601 PyObject *new_unicode = resize_compact(unicode, length);
1602 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001603 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001604 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001605 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001606 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001607 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001608}
1609
Alexander Belopolsky40018472011-02-26 01:02:56 +00001610int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001611PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001612{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613 PyObject *unicode;
1614 if (p_unicode == NULL) {
1615 PyErr_BadInternalCall();
1616 return -1;
1617 }
1618 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001619 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 {
1621 PyErr_BadInternalCall();
1622 return -1;
1623 }
1624 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001625}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001626
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001627static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001628unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1629 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001630{
1631 PyObject *result;
1632 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001633 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001634 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1635 return 0;
1636 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1637 maxchar);
1638 if (result == NULL)
1639 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001640 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001641 Py_DECREF(*p_unicode);
1642 *p_unicode = result;
1643 return 0;
1644}
1645
1646static int
1647unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1648 Py_UCS4 ch)
1649{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001650 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001651 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652 return -1;
1653 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1654 PyUnicode_DATA(*p_unicode),
1655 (*pos)++, ch);
1656 return 0;
1657}
1658
Victor Stinnerc5166102012-02-22 13:55:02 +01001659/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001660
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001661 WARNING: The function doesn't copy the terminating null character and
1662 doesn't check the maximum character (may write a latin1 character in an
1663 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001664static void
1665unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1666 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001667{
1668 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1669 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001670 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001671
1672 switch (kind) {
1673 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001674 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001675 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001676 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001677 }
1678 case PyUnicode_2BYTE_KIND: {
1679 Py_UCS2 *start = (Py_UCS2 *)data + index;
1680 Py_UCS2 *ucs2 = start;
1681 assert(index <= PyUnicode_GET_LENGTH(unicode));
1682
Victor Stinner184252a2012-06-16 02:57:41 +02001683 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001684 *ucs2 = (Py_UCS2)*str;
1685
1686 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001687 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001688 }
1689 default: {
1690 Py_UCS4 *start = (Py_UCS4 *)data + index;
1691 Py_UCS4 *ucs4 = start;
1692 assert(kind == PyUnicode_4BYTE_KIND);
1693 assert(index <= PyUnicode_GET_LENGTH(unicode));
1694
Victor Stinner184252a2012-06-16 02:57:41 +02001695 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001696 *ucs4 = (Py_UCS4)*str;
1697
1698 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001699 }
1700 }
1701}
1702
1703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001704static PyObject*
1705get_latin1_char(unsigned char ch)
1706{
Victor Stinnera464fc12011-10-02 20:39:30 +02001707 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001708 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001709 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001710 if (!unicode)
1711 return NULL;
1712 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001713 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001714 unicode_latin1[ch] = unicode;
1715 }
1716 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001717 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001718}
1719
Alexander Belopolsky40018472011-02-26 01:02:56 +00001720PyObject *
1721PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001723 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001724 Py_UCS4 maxchar = 0;
1725 Py_ssize_t num_surrogates;
1726
1727 if (u == NULL)
1728 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001730 /* If the Unicode data is known at construction time, we can apply
1731 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 /* Optimization for empty strings */
1734 if (size == 0 && unicode_empty != NULL) {
1735 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001736 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001737 }
Tim Petersced69f82003-09-16 20:30:58 +00001738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 /* Single character Unicode objects in the Latin-1 range are
1740 shared when using this constructor */
1741 if (size == 1 && *u < 256)
1742 return get_latin1_char((unsigned char)*u);
1743
1744 /* If not empty and not single character, copy the Unicode data
1745 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001746 if (find_maxchar_surrogates(u, u + size,
1747 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 return NULL;
1749
Victor Stinner8faf8212011-12-08 22:14:11 +01001750 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 if (!unicode)
1752 return NULL;
1753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 switch (PyUnicode_KIND(unicode)) {
1755 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001756 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1758 break;
1759 case PyUnicode_2BYTE_KIND:
1760#if Py_UNICODE_SIZE == 2
1761 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1762#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001763 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1765#endif
1766 break;
1767 case PyUnicode_4BYTE_KIND:
1768#if SIZEOF_WCHAR_T == 2
1769 /* This is the only case which has to process surrogates, thus
1770 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001771 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001772#else
1773 assert(num_surrogates == 0);
1774 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1775#endif
1776 break;
1777 default:
1778 assert(0 && "Impossible state");
1779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001781 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782}
1783
Alexander Belopolsky40018472011-02-26 01:02:56 +00001784PyObject *
1785PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001786{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001787 if (size < 0) {
1788 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001789 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001790 return NULL;
1791 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001792 if (u != NULL)
1793 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1794 else
1795 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001800{
1801 size_t size = strlen(u);
1802 if (size > PY_SSIZE_T_MAX) {
1803 PyErr_SetString(PyExc_OverflowError, "input too long");
1804 return NULL;
1805 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001806 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001807}
1808
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001809PyObject *
1810_PyUnicode_FromId(_Py_Identifier *id)
1811{
1812 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001813 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1814 strlen(id->string),
1815 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001816 if (!id->object)
1817 return NULL;
1818 PyUnicode_InternInPlace(&id->object);
1819 assert(!id->next);
1820 id->next = static_strings;
1821 static_strings = id;
1822 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001823 return id->object;
1824}
1825
1826void
1827_PyUnicode_ClearStaticStrings()
1828{
1829 _Py_Identifier *i;
1830 for (i = static_strings; i; i = i->next) {
1831 Py_DECREF(i->object);
1832 i->object = NULL;
1833 i->next = NULL;
1834 }
1835}
1836
Benjamin Peterson0df54292012-03-26 14:50:32 -04001837/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001838
Victor Stinnerd3f08822012-05-29 12:57:52 +02001839PyObject*
1840_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001841{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001842 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001843 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001844 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001845#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001846 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001847#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001848 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001849 }
Victor Stinner785938e2011-12-11 20:09:03 +01001850 unicode = PyUnicode_New(size, 127);
1851 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001852 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001853 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1854 assert(_PyUnicode_CheckConsistency(unicode, 1));
1855 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001856}
1857
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001858static Py_UCS4
1859kind_maxchar_limit(unsigned int kind)
1860{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001861 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001862 case PyUnicode_1BYTE_KIND:
1863 return 0x80;
1864 case PyUnicode_2BYTE_KIND:
1865 return 0x100;
1866 case PyUnicode_4BYTE_KIND:
1867 return 0x10000;
1868 default:
1869 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001870 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001871 }
1872}
1873
Victor Stinnere6abb482012-05-02 01:15:40 +02001874Py_LOCAL_INLINE(Py_UCS4)
1875align_maxchar(Py_UCS4 maxchar)
1876{
1877 if (maxchar <= 127)
1878 return 127;
1879 else if (maxchar <= 255)
1880 return 255;
1881 else if (maxchar <= 65535)
1882 return 65535;
1883 else
1884 return MAX_UNICODE;
1885}
1886
Victor Stinner702c7342011-10-05 13:50:52 +02001887static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001888_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001891 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001892
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001893 if (size == 0) {
1894 Py_INCREF(unicode_empty);
1895 return unicode_empty;
1896 }
1897 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001898 if (size == 1)
1899 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001900
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001901 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001902 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001903 if (!res)
1904 return NULL;
1905 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001906 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001908}
1909
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910static PyObject*
1911_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912{
1913 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001914 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001915
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916 if (size == 0) {
1917 Py_INCREF(unicode_empty);
1918 return unicode_empty;
1919 }
1920 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001921 if (size == 1) {
1922 Py_UCS4 ch = u[0];
1923 if (ch < 256)
1924 return get_latin1_char((unsigned char)ch);
1925
1926 res = PyUnicode_New(1, ch);
1927 if (res == NULL)
1928 return NULL;
1929 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1930 assert(_PyUnicode_CheckConsistency(res, 1));
1931 return res;
1932 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001933
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001934 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001935 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936 if (!res)
1937 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001938 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001939 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001940 else {
1941 _PyUnicode_CONVERT_BYTES(
1942 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1943 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001944 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 return res;
1946}
1947
Victor Stinnere57b1c02011-09-28 22:20:48 +02001948static PyObject*
1949_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001950{
1951 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001952 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001953
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001954 if (size == 0) {
1955 Py_INCREF(unicode_empty);
1956 return unicode_empty;
1957 }
1958 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001959 if (size == 1) {
1960 Py_UCS4 ch = u[0];
1961 if (ch < 256)
1962 return get_latin1_char((unsigned char)ch);
1963
1964 res = PyUnicode_New(1, ch);
1965 if (res == NULL)
1966 return NULL;
1967 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1968 assert(_PyUnicode_CheckConsistency(res, 1));
1969 return res;
1970 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001972 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001973 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974 if (!res)
1975 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001976 if (max_char < 256)
1977 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1978 PyUnicode_1BYTE_DATA(res));
1979 else if (max_char < 0x10000)
1980 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1981 PyUnicode_2BYTE_DATA(res));
1982 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 return res;
1986}
1987
1988PyObject*
1989PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1990{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001991 if (size < 0) {
1992 PyErr_SetString(PyExc_ValueError, "size must be positive");
1993 return NULL;
1994 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06001995 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001996 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001997 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001998 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001999 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002001 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002003 PyErr_SetString(PyExc_SystemError, "invalid kind");
2004 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002006}
2007
Victor Stinnerece58de2012-04-23 23:36:38 +02002008Py_UCS4
2009_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2010{
2011 enum PyUnicode_Kind kind;
2012 void *startptr, *endptr;
2013
2014 assert(PyUnicode_IS_READY(unicode));
2015 assert(0 <= start);
2016 assert(end <= PyUnicode_GET_LENGTH(unicode));
2017 assert(start <= end);
2018
2019 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2020 return PyUnicode_MAX_CHAR_VALUE(unicode);
2021
2022 if (start == end)
2023 return 127;
2024
Victor Stinner94d558b2012-04-27 22:26:58 +02002025 if (PyUnicode_IS_ASCII(unicode))
2026 return 127;
2027
Victor Stinnerece58de2012-04-23 23:36:38 +02002028 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002029 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002030 endptr = (char *)startptr + end * kind;
2031 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002032 switch(kind) {
2033 case PyUnicode_1BYTE_KIND:
2034 return ucs1lib_find_max_char(startptr, endptr);
2035 case PyUnicode_2BYTE_KIND:
2036 return ucs2lib_find_max_char(startptr, endptr);
2037 case PyUnicode_4BYTE_KIND:
2038 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002039 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002040 assert(0);
2041 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002042 }
2043}
2044
Victor Stinner25a4b292011-10-06 12:31:55 +02002045/* Ensure that a string uses the most efficient storage, if it is not the
2046 case: create a new string with of the right kind. Write NULL into *p_unicode
2047 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002048static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002049unicode_adjust_maxchar(PyObject **p_unicode)
2050{
2051 PyObject *unicode, *copy;
2052 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002053 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002054 unsigned int kind;
2055
2056 assert(p_unicode != NULL);
2057 unicode = *p_unicode;
2058 assert(PyUnicode_IS_READY(unicode));
2059 if (PyUnicode_IS_ASCII(unicode))
2060 return;
2061
2062 len = PyUnicode_GET_LENGTH(unicode);
2063 kind = PyUnicode_KIND(unicode);
2064 if (kind == PyUnicode_1BYTE_KIND) {
2065 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002066 max_char = ucs1lib_find_max_char(u, u + len);
2067 if (max_char >= 128)
2068 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002069 }
2070 else if (kind == PyUnicode_2BYTE_KIND) {
2071 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002072 max_char = ucs2lib_find_max_char(u, u + len);
2073 if (max_char >= 256)
2074 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002075 }
2076 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002077 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002078 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002079 max_char = ucs4lib_find_max_char(u, u + len);
2080 if (max_char >= 0x10000)
2081 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002082 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002083 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002084 if (copy != NULL)
2085 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002086 Py_DECREF(unicode);
2087 *p_unicode = copy;
2088}
2089
Victor Stinner034f6cf2011-09-30 02:26:44 +02002090PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002091_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002092{
Victor Stinner87af4f22011-11-21 23:03:47 +01002093 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002094 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002095
Victor Stinner034f6cf2011-09-30 02:26:44 +02002096 if (!PyUnicode_Check(unicode)) {
2097 PyErr_BadInternalCall();
2098 return NULL;
2099 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002100 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002101 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002102
Victor Stinner87af4f22011-11-21 23:03:47 +01002103 length = PyUnicode_GET_LENGTH(unicode);
2104 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002105 if (!copy)
2106 return NULL;
2107 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2108
Victor Stinner87af4f22011-11-21 23:03:47 +01002109 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2110 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002111 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002112 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002113}
2114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115
Victor Stinnerbc603d12011-10-02 01:00:40 +02002116/* Widen Unicode objects to larger buffers. Don't write terminating null
2117 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002118
2119void*
2120_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2121{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002122 Py_ssize_t len;
2123 void *result;
2124 unsigned int skind;
2125
Benjamin Petersonbac79492012-01-14 13:34:47 -05002126 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002127 return NULL;
2128
2129 len = PyUnicode_GET_LENGTH(s);
2130 skind = PyUnicode_KIND(s);
2131 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002132 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 return NULL;
2134 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002135 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136 case PyUnicode_2BYTE_KIND:
2137 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2138 if (!result)
2139 return PyErr_NoMemory();
2140 assert(skind == PyUnicode_1BYTE_KIND);
2141 _PyUnicode_CONVERT_BYTES(
2142 Py_UCS1, Py_UCS2,
2143 PyUnicode_1BYTE_DATA(s),
2144 PyUnicode_1BYTE_DATA(s) + len,
2145 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 case PyUnicode_4BYTE_KIND:
2148 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2149 if (!result)
2150 return PyErr_NoMemory();
2151 if (skind == PyUnicode_2BYTE_KIND) {
2152 _PyUnicode_CONVERT_BYTES(
2153 Py_UCS2, Py_UCS4,
2154 PyUnicode_2BYTE_DATA(s),
2155 PyUnicode_2BYTE_DATA(s) + len,
2156 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002157 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002158 else {
2159 assert(skind == PyUnicode_1BYTE_KIND);
2160 _PyUnicode_CONVERT_BYTES(
2161 Py_UCS1, Py_UCS4,
2162 PyUnicode_1BYTE_DATA(s),
2163 PyUnicode_1BYTE_DATA(s) + len,
2164 result);
2165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 default:
2168 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169 }
Victor Stinner01698042011-10-04 00:04:26 +02002170 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002171 return NULL;
2172}
2173
2174static Py_UCS4*
2175as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2176 int copy_null)
2177{
2178 int kind;
2179 void *data;
2180 Py_ssize_t len, targetlen;
2181 if (PyUnicode_READY(string) == -1)
2182 return NULL;
2183 kind = PyUnicode_KIND(string);
2184 data = PyUnicode_DATA(string);
2185 len = PyUnicode_GET_LENGTH(string);
2186 targetlen = len;
2187 if (copy_null)
2188 targetlen++;
2189 if (!target) {
2190 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2191 PyErr_NoMemory();
2192 return NULL;
2193 }
2194 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2195 if (!target) {
2196 PyErr_NoMemory();
2197 return NULL;
2198 }
2199 }
2200 else {
2201 if (targetsize < targetlen) {
2202 PyErr_Format(PyExc_SystemError,
2203 "string is longer than the buffer");
2204 if (copy_null && 0 < targetsize)
2205 target[0] = 0;
2206 return NULL;
2207 }
2208 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002209 if (kind == PyUnicode_1BYTE_KIND) {
2210 Py_UCS1 *start = (Py_UCS1 *) data;
2211 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002213 else if (kind == PyUnicode_2BYTE_KIND) {
2214 Py_UCS2 *start = (Py_UCS2 *) data;
2215 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2216 }
2217 else {
2218 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002220 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 if (copy_null)
2222 target[len] = 0;
2223 return target;
2224}
2225
2226Py_UCS4*
2227PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002230 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 PyErr_BadInternalCall();
2232 return NULL;
2233 }
2234 return as_ucs4(string, target, targetsize, copy_null);
2235}
2236
2237Py_UCS4*
2238PyUnicode_AsUCS4Copy(PyObject *string)
2239{
2240 return as_ucs4(string, NULL, 0, 1);
2241}
2242
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size` characters.
   A `size` of -1 means that the length is computed with wcslen().  A NULL
   `w` is accepted only together with a size of 0, in which case the empty
   string is returned; otherwise it is a bad internal call.  Return NULL
   with an exception set on error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    /* no buffer: only the empty string can be built */
    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    Py_INCREF(unicode_empty);
    return unicode_empty;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002265
Walter Dörwald346737f2007-05-31 10:44:43 +00002266static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002267makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2268 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002269{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 *fmt++ = '%';
2271 if (width) {
2272 if (zeropad)
2273 *fmt++ = '0';
2274 fmt += sprintf(fmt, "%d", width);
2275 }
2276 if (precision)
2277 fmt += sprintf(fmt, ".%d", precision);
2278 if (longflag)
2279 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002280 else if (longlongflag) {
2281 /* longlongflag should only ever be nonzero on machines with
2282 HAVE_LONG_LONG defined */
2283#ifdef HAVE_LONG_LONG
2284 char *f = PY_FORMAT_LONG_LONG;
2285 while (*f)
2286 *fmt++ = *f++;
2287#else
2288 /* we shouldn't ever get here */
2289 assert(0);
2290 *fmt++ = 'l';
2291#endif
2292 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002293 else if (size_tflag) {
2294 char *f = PY_FORMAT_SIZE_T;
2295 while (*f)
2296 *fmt++ = *f++;
2297 }
2298 *fmt++ = c;
2299 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002300}
2301
Victor Stinner96865452011-03-01 23:44:09 +00002302/* helper for PyUnicode_FromFormatV() */
2303
2304static const char*
2305parse_format_flags(const char *f,
2306 int *p_width, int *p_precision,
2307 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2308{
2309 int width, precision, longflag, longlongflag, size_tflag;
2310
2311 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2312 f++;
2313 width = 0;
2314 while (Py_ISDIGIT((unsigned)*f))
2315 width = (width*10) + *f++ - '0';
2316 precision = 0;
2317 if (*f == '.') {
2318 f++;
2319 while (Py_ISDIGIT((unsigned)*f))
2320 precision = (precision*10) + *f++ - '0';
2321 if (*f == '%') {
2322 /* "%.3%s" => f points to "3" */
2323 f--;
2324 }
2325 }
2326 if (*f == '\0') {
2327 /* bogus format "%.1" => go backward, f points to "1" */
2328 f--;
2329 }
2330 if (p_width != NULL)
2331 *p_width = width;
2332 if (p_precision != NULL)
2333 *p_precision = precision;
2334
2335 /* Handle %ld, %lu, %lld and %llu. */
2336 longflag = 0;
2337 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002338 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002339
2340 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002341 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002342 longflag = 1;
2343 ++f;
2344 }
2345#ifdef HAVE_LONG_LONG
2346 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002347 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002348 longlongflag = 1;
2349 f += 2;
2350 }
2351#endif
2352 }
2353 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002354 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002355 size_tflag = 1;
2356 ++f;
2357 }
2358 if (p_longflag != NULL)
2359 *p_longflag = longflag;
2360 if (p_longlongflag != NULL)
2361 *p_longlongflag = longlongflag;
2362 if (p_size_tflag != NULL)
2363 *p_size_tflag = size_tflag;
2364 return f;
2365}
2366
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002367/* maximum number of characters required for output of %ld. 21 characters
2368 allows for 64-bit integers (in decimal) and an optional sign. */
2369#define MAX_LONG_CHARS 21
2370/* maximum number of characters required for output of %lld.
2371 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2372 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2373#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2374
Walter Dörwaldd2034312007-05-18 16:29:38 +00002375PyObject *
2376PyUnicode_FromFormatV(const char *format, va_list vargs)
2377{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002378 va_list count;
2379 Py_ssize_t callcount = 0;
2380 PyObject **callresults = NULL;
2381 PyObject **callresult = NULL;
2382 Py_ssize_t n = 0;
2383 int width = 0;
2384 int precision = 0;
2385 int zeropad;
2386 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002387 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002388 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002389 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002390 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2391 Py_UCS4 argmaxchar;
2392 Py_ssize_t numbersize = 0;
2393 char *numberresults = NULL;
2394 char *numberresult = NULL;
2395 Py_ssize_t i;
2396 int kind;
2397 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002398
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002399 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002400 /* step 1: count the number of %S/%R/%A/%s format specifications
2401 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2402 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002404 * also estimate a upper bound for all the number formats in the string,
2405 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 for (f = format; *f; f++) {
2408 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002409 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2411 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2412 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2413 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002416#ifdef HAVE_LONG_LONG
2417 if (longlongflag) {
2418 if (width < MAX_LONG_LONG_CHARS)
2419 width = MAX_LONG_LONG_CHARS;
2420 }
2421 else
2422#endif
2423 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2424 including sign. Decimal takes the most space. This
2425 isn't enough for octal. If a width is specified we
2426 need more (which we allocate later). */
2427 if (width < MAX_LONG_CHARS)
2428 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429
2430 /* account for the size + '\0' to separate numbers
2431 inside of the numberresults buffer */
2432 numbersize += (width + 1);
2433 }
2434 }
2435 else if ((unsigned char)*f > 127) {
2436 PyErr_Format(PyExc_ValueError,
2437 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2438 "string, got a non-ASCII byte: 0x%02x",
2439 (unsigned char)*f);
2440 return NULL;
2441 }
2442 }
2443 /* step 2: allocate memory for the results of
2444 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2445 if (callcount) {
2446 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2447 if (!callresults) {
2448 PyErr_NoMemory();
2449 return NULL;
2450 }
2451 callresult = callresults;
2452 }
2453 /* step 2.5: allocate memory for the results of formating numbers */
2454 if (numbersize) {
2455 numberresults = PyObject_Malloc(numbersize);
2456 if (!numberresults) {
2457 PyErr_NoMemory();
2458 goto fail;
2459 }
2460 numberresult = numberresults;
2461 }
2462
2463 /* step 3: format numbers and figure out how large a buffer we need */
2464 for (f = format; *f; f++) {
2465 if (*f == '%') {
2466 const char* p;
2467 int longflag;
2468 int longlongflag;
2469 int size_tflag;
2470 int numprinted;
2471
2472 p = f;
2473 zeropad = (f[1] == '0');
2474 f = parse_format_flags(f, &width, &precision,
2475 &longflag, &longlongflag, &size_tflag);
2476 switch (*f) {
2477 case 'c':
2478 {
2479 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002480 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 n++;
2482 break;
2483 }
2484 case '%':
2485 n++;
2486 break;
2487 case 'i':
2488 case 'd':
2489 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2490 width, precision, *f);
2491 if (longflag)
2492 numprinted = sprintf(numberresult, fmt,
2493 va_arg(count, long));
2494#ifdef HAVE_LONG_LONG
2495 else if (longlongflag)
2496 numprinted = sprintf(numberresult, fmt,
2497 va_arg(count, PY_LONG_LONG));
2498#endif
2499 else if (size_tflag)
2500 numprinted = sprintf(numberresult, fmt,
2501 va_arg(count, Py_ssize_t));
2502 else
2503 numprinted = sprintf(numberresult, fmt,
2504 va_arg(count, int));
2505 n += numprinted;
2506 /* advance by +1 to skip over the '\0' */
2507 numberresult += (numprinted + 1);
2508 assert(*(numberresult - 1) == '\0');
2509 assert(*(numberresult - 2) != '\0');
2510 assert(numprinted >= 0);
2511 assert(numberresult <= numberresults + numbersize);
2512 break;
2513 case 'u':
2514 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2515 width, precision, 'u');
2516 if (longflag)
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, unsigned long));
2519#ifdef HAVE_LONG_LONG
2520 else if (longlongflag)
2521 numprinted = sprintf(numberresult, fmt,
2522 va_arg(count, unsigned PY_LONG_LONG));
2523#endif
2524 else if (size_tflag)
2525 numprinted = sprintf(numberresult, fmt,
2526 va_arg(count, size_t));
2527 else
2528 numprinted = sprintf(numberresult, fmt,
2529 va_arg(count, unsigned int));
2530 n += numprinted;
2531 numberresult += (numprinted + 1);
2532 assert(*(numberresult - 1) == '\0');
2533 assert(*(numberresult - 2) != '\0');
2534 assert(numprinted >= 0);
2535 assert(numberresult <= numberresults + numbersize);
2536 break;
2537 case 'x':
2538 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2539 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2540 n += numprinted;
2541 numberresult += (numprinted + 1);
2542 assert(*(numberresult - 1) == '\0');
2543 assert(*(numberresult - 2) != '\0');
2544 assert(numprinted >= 0);
2545 assert(numberresult <= numberresults + numbersize);
2546 break;
2547 case 'p':
2548 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2549 /* %p is ill-defined: ensure leading 0x. */
2550 if (numberresult[1] == 'X')
2551 numberresult[1] = 'x';
2552 else if (numberresult[1] != 'x') {
2553 memmove(numberresult + 2, numberresult,
2554 strlen(numberresult) + 1);
2555 numberresult[0] = '0';
2556 numberresult[1] = 'x';
2557 numprinted += 2;
2558 }
2559 n += numprinted;
2560 numberresult += (numprinted + 1);
2561 assert(*(numberresult - 1) == '\0');
2562 assert(*(numberresult - 2) != '\0');
2563 assert(numprinted >= 0);
2564 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 break;
2566 case 's':
2567 {
2568 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002569 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002570 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002571 if (!str)
2572 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 /* since PyUnicode_DecodeUTF8 returns already flexible
2574 unicode objects, there is no need to call ready on them */
2575 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002576 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002577 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002578 /* Remember the str and switch to the next slot */
2579 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002580 break;
2581 }
2582 case 'U':
2583 {
2584 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002585 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002586 if (PyUnicode_READY(obj) == -1)
2587 goto fail;
2588 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002589 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002590 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 break;
2592 }
2593 case 'V':
2594 {
2595 PyObject *obj = va_arg(count, PyObject *);
2596 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002597 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002599 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002600 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 if (PyUnicode_READY(obj) == -1)
2602 goto fail;
2603 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002604 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002606 *callresult++ = NULL;
2607 }
2608 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002609 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002610 if (!str_obj)
2611 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002612 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002613 Py_DECREF(str_obj);
2614 goto fail;
2615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002617 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002619 *callresult++ = str_obj;
2620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002621 break;
2622 }
2623 case 'S':
2624 {
2625 PyObject *obj = va_arg(count, PyObject *);
2626 PyObject *str;
2627 assert(obj);
2628 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002629 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002631 if (PyUnicode_READY(str) == -1) {
2632 Py_DECREF(str);
2633 goto fail;
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002636 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 /* Remember the str and switch to the next slot */
2639 *callresult++ = str;
2640 break;
2641 }
2642 case 'R':
2643 {
2644 PyObject *obj = va_arg(count, PyObject *);
2645 PyObject *repr;
2646 assert(obj);
2647 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002648 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002650 if (PyUnicode_READY(repr) == -1) {
2651 Py_DECREF(repr);
2652 goto fail;
2653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002654 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002655 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* Remember the repr and switch to the next slot */
2658 *callresult++ = repr;
2659 break;
2660 }
2661 case 'A':
2662 {
2663 PyObject *obj = va_arg(count, PyObject *);
2664 PyObject *ascii;
2665 assert(obj);
2666 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002667 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002669 if (PyUnicode_READY(ascii) == -1) {
2670 Py_DECREF(ascii);
2671 goto fail;
2672 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002674 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002675 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002676 /* Remember the repr and switch to the next slot */
2677 *callresult++ = ascii;
2678 break;
2679 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002680 default:
2681 /* if we stumble upon an unknown
2682 formatting code, copy the rest of
2683 the format string to the output
2684 string. (we cannot just skip the
2685 code, since there's no way to know
2686 what's in the argument list) */
2687 n += strlen(p);
2688 goto expand;
2689 }
2690 } else
2691 n++;
2692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002693 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002695 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 we don't have to resize the string.
2697 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002698 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002699 if (!string)
2700 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 kind = PyUnicode_KIND(string);
2702 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002704 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002706 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002707 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002708 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002709
2710 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2712 /* checking for == because the last argument could be a empty
2713 string, which causes i to point to end, the assert at the end of
2714 the loop */
2715 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002716
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 switch (*f) {
2718 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002719 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 const int ordinal = va_arg(vargs, int);
2721 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002722 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002723 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002724 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002726 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002727 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002729 {
Victor Stinner184252a2012-06-16 02:57:41 +02002730 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731 /* unused, since we already have the result */
2732 if (*f == 'p')
2733 (void) va_arg(vargs, void *);
2734 else
2735 (void) va_arg(vargs, int);
2736 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002737 len = strlen(numberresult);
2738 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002740 i += len;
2741 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 assert(*numberresult == '\0');
2743 numberresult++;
2744 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002746 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002747 case 's':
2748 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002749 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002751 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002752 size = PyUnicode_GET_LENGTH(*callresult);
2753 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002754 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002756 /* We're done with the unicode()/repr() => forget it */
2757 Py_DECREF(*callresult);
2758 /* switch to next unicode()/repr() result */
2759 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002760 break;
2761 }
2762 case 'U':
2763 {
2764 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002765 Py_ssize_t size;
2766 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2767 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002768 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002770 break;
2771 }
2772 case 'V':
2773 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002774 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002775 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002776 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002777 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 size = PyUnicode_GET_LENGTH(obj);
2779 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002780 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002782 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 size = PyUnicode_GET_LENGTH(*callresult);
2784 assert(PyUnicode_KIND(*callresult) <=
2785 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002786 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002788 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 break;
2792 }
2793 case 'S':
2794 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002795 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002797 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 /* unused, since we already have the result */
2799 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002800 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002801 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002802 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 /* We're done with the unicode()/repr() => forget it */
2804 Py_DECREF(*callresult);
2805 /* switch to next unicode()/repr() result */
2806 ++callresult;
2807 break;
2808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002810 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 break;
2812 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002813 {
2814 Py_ssize_t len = strlen(p);
2815 unicode_write_cstr(string, i, p, len);
2816 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002817 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002818 goto end;
2819 }
Victor Stinner184252a2012-06-16 02:57:41 +02002820 }
Victor Stinner1205f272010-09-11 00:54:47 +00002821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002822 else {
2823 assert(i < PyUnicode_GET_LENGTH(string));
2824 PyUnicode_WRITE(kind, data, i++, *f);
2825 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002827 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002828
Benjamin Peterson29060642009-01-31 22:14:21 +00002829 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002830 if (callresults)
2831 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 if (numberresults)
2833 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002834 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002835 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002836 if (callresults) {
2837 PyObject **callresult2 = callresults;
2838 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002839 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002840 ++callresult2;
2841 }
2842 PyObject_Free(callresults);
2843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002844 if (numberresults)
2845 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002846 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002847}
2848
Walter Dörwaldd2034312007-05-18 16:29:38 +00002849PyObject *
2850PyUnicode_FromFormat(const char *format, ...)
2851{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002852 PyObject* ret;
2853 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002854
2855#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002856 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002857#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002859#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002860 ret = PyUnicode_FromFormatV(format, vargs);
2861 va_end(vargs);
2862 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002863}
2864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002865#ifdef HAVE_WCHAR_H
2866
Victor Stinner5593d8a2010-10-02 11:11:27 +00002867/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2868 convert a Unicode object to a wide character string.
2869
Victor Stinnerd88d9832011-09-06 02:00:05 +02002870 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002871 character) required to convert the unicode object. Ignore size argument.
2872
Victor Stinnerd88d9832011-09-06 02:00:05 +02002873 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002874 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002875 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002876static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002877unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002878 wchar_t *w,
2879 Py_ssize_t size)
2880{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002881 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002882 const wchar_t *wstr;
2883
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002884 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002885 if (wstr == NULL)
2886 return -1;
2887
Victor Stinner5593d8a2010-10-02 11:11:27 +00002888 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002889 if (size > res)
2890 size = res + 1;
2891 else
2892 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002893 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002894 return res;
2895 }
2896 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002897 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002898}
2899
2900Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002901PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002902 wchar_t *w,
2903 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904{
2905 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002906 PyErr_BadInternalCall();
2907 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002909 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910}
2911
Victor Stinner137c34c2010-09-29 10:25:54 +00002912wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002913PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002914 Py_ssize_t *size)
2915{
2916 wchar_t* buffer;
2917 Py_ssize_t buflen;
2918
2919 if (unicode == NULL) {
2920 PyErr_BadInternalCall();
2921 return NULL;
2922 }
2923
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002924 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002925 if (buflen == -1)
2926 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002927 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002928 PyErr_NoMemory();
2929 return NULL;
2930 }
2931
Victor Stinner137c34c2010-09-29 10:25:54 +00002932 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2933 if (buffer == NULL) {
2934 PyErr_NoMemory();
2935 return NULL;
2936 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002937 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002938 if (buflen == -1) {
2939 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002940 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002941 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002942 if (size != NULL)
2943 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002944 return buffer;
2945}
2946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002947#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002948
Alexander Belopolsky40018472011-02-26 01:02:56 +00002949PyObject *
2950PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002952 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002953 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002954 PyErr_SetString(PyExc_ValueError,
2955 "chr() arg not in range(0x110000)");
2956 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002957 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002959 if (ordinal < 256)
2960 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002962 v = PyUnicode_New(1, ordinal);
2963 if (v == NULL)
2964 return NULL;
2965 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002966 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002967 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002968}
2969
Alexander Belopolsky40018472011-02-26 01:02:56 +00002970PyObject *
2971PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002973 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002975 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002976 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002977 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 Py_INCREF(obj);
2979 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002980 }
2981 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002982 /* For a Unicode subtype that's not a Unicode object,
2983 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002984 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002985 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002986 PyErr_Format(PyExc_TypeError,
2987 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002988 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002989 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 const char *encoding,
2995 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002996{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002997 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002998 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002999
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003001 PyErr_BadInternalCall();
3002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003004
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003005 /* Decoding bytes objects is the most common case and should be fast */
3006 if (PyBytes_Check(obj)) {
3007 if (PyBytes_GET_SIZE(obj) == 0) {
3008 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003009 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003010 }
3011 else {
3012 v = PyUnicode_Decode(
3013 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3014 encoding, errors);
3015 }
3016 return v;
3017 }
3018
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003019 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003020 PyErr_SetString(PyExc_TypeError,
3021 "decoding str is not supported");
3022 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003023 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003024
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003025 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3026 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3027 PyErr_Format(PyExc_TypeError,
3028 "coercing to str: need bytes, bytearray "
3029 "or buffer-like object, %.80s found",
3030 Py_TYPE(obj)->tp_name);
3031 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003032 }
Tim Petersced69f82003-09-16 20:30:58 +00003033
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003034 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02003036 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 }
Tim Petersced69f82003-09-16 20:30:58 +00003038 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003039 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003040
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003041 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003042 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003043}
3044
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   encoding:  null-terminated encoding name, or NULL.
   lower:     output buffer.
   lower_len: capacity of lower in bytes, terminator included.

   Return 1 on success (lower holds the null-terminated normalized name),
   0 on error, i.e. when the result does not fit in lower_len - 1
   characters.  On error the content of lower is unspecified and not
   necessarily null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* Without room for the terminator nothing can be stored, and
       &lower[lower_len - 1] would point before the buffer. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* Honour lower_len here too instead of copying blindly. */
        if (lower_len < sizeof(default_encoding))
            return 0;
        memcpy(lower, default_encoding, sizeof(default_encoding));
        return 1;
    }

    l = lower;
    l_end = &lower[lower_len - 1];      /* slot reserved for '\0' */
    for (e = encoding; *e != '\0'; e++) {
        if (l == l_end)
            return 0;
        /* Keep side effects out of the macro arguments. */
        if (Py_ISUPPER(*e))
            *l++ = Py_TOLOWER(*e);
        else if (*e == '_')
            *l++ = '-';
        else
            *l++ = *e;
    }
    *l = '\0';
    return 1;
}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 Py_ssize_t size,
3085 const char *encoding,
3086 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003087{
3088 PyObject *buffer = NULL, *unicode;
3089 Py_buffer info;
3090 char lower[11]; /* Enough for any encoding shortcut */
3091
Fred Drakee4315f52000-05-09 19:53:39 +00003092 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003093 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003094 if ((strcmp(lower, "utf-8") == 0) ||
3095 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003096 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003097 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003098 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003099 (strcmp(lower, "iso-8859-1") == 0))
3100 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003101#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003102 else if (strcmp(lower, "mbcs") == 0)
3103 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003104#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003105 else if (strcmp(lower, "ascii") == 0)
3106 return PyUnicode_DecodeASCII(s, size, errors);
3107 else if (strcmp(lower, "utf-16") == 0)
3108 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3109 else if (strcmp(lower, "utf-32") == 0)
3110 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3111 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003112
3113 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003114 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003115 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003116 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003117 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118 if (buffer == NULL)
3119 goto onError;
3120 unicode = PyCodec_Decode(buffer, encoding, errors);
3121 if (unicode == NULL)
3122 goto onError;
3123 if (!PyUnicode_Check(unicode)) {
3124 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003125 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003126 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 Py_DECREF(unicode);
3128 goto onError;
3129 }
3130 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003131 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003132
Benjamin Peterson29060642009-01-31 22:14:21 +00003133 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134 Py_XDECREF(buffer);
3135 return NULL;
3136}
3137
Alexander Belopolsky40018472011-02-26 01:02:56 +00003138PyObject *
3139PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003140 const char *encoding,
3141 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003142{
3143 PyObject *v;
3144
3145 if (!PyUnicode_Check(unicode)) {
3146 PyErr_BadArgument();
3147 goto onError;
3148 }
3149
3150 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003151 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003152
3153 /* Decode via the codec registry */
3154 v = PyCodec_Decode(unicode, encoding, errors);
3155 if (v == NULL)
3156 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003157 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003158
Benjamin Peterson29060642009-01-31 22:14:21 +00003159 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003160 return NULL;
3161}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 const char *encoding,
3166 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003167{
3168 PyObject *v;
3169
3170 if (!PyUnicode_Check(unicode)) {
3171 PyErr_BadArgument();
3172 goto onError;
3173 }
3174
3175 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003177
3178 /* Decode via the codec registry */
3179 v = PyCodec_Decode(unicode, encoding, errors);
3180 if (v == NULL)
3181 goto onError;
3182 if (!PyUnicode_Check(v)) {
3183 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003184 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185 Py_TYPE(v)->tp_name);
3186 Py_DECREF(v);
3187 goto onError;
3188 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003189 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003192 return NULL;
3193}
3194
Alexander Belopolsky40018472011-02-26 01:02:56 +00003195PyObject *
3196PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003197 Py_ssize_t size,
3198 const char *encoding,
3199 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200{
3201 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003202
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203 unicode = PyUnicode_FromUnicode(s, size);
3204 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003205 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3207 Py_DECREF(unicode);
3208 return v;
3209}
3210
Alexander Belopolsky40018472011-02-26 01:02:56 +00003211PyObject *
3212PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003213 const char *encoding,
3214 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003215{
3216 PyObject *v;
3217
3218 if (!PyUnicode_Check(unicode)) {
3219 PyErr_BadArgument();
3220 goto onError;
3221 }
3222
3223 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003225
3226 /* Encode via the codec registry */
3227 v = PyCodec_Encode(unicode, encoding, errors);
3228 if (v == NULL)
3229 goto onError;
3230 return v;
3231
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003233 return NULL;
3234}
3235
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide).  Return the index, in wchar_t units,
   of the first character for which wcstombs() fails, or 0 if every
   character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminator */
#else
    wchar_t unit[2];            /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;
    const wchar_t *unit_start;
    size_t converted;

    /* The last slot always holds the terminator. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    for (cursor = wstr; *cursor != L'\0'; ) {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        converted = wcstombs(scratch, unit, sizeof(scratch));
        if (converted == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3282
Victor Stinner1b579672011-12-17 05:47:23 +01003283static int
3284locale_error_handler(const char *errors, int *surrogateescape)
3285{
3286 if (errors == NULL) {
3287 *surrogateescape = 0;
3288 return 0;
3289 }
3290
3291 if (strcmp(errors, "strict") == 0) {
3292 *surrogateescape = 0;
3293 return 0;
3294 }
3295 if (strcmp(errors, "surrogateescape") == 0) {
3296 *surrogateescape = 1;
3297 return 0;
3298 }
3299 PyErr_Format(PyExc_ValueError,
3300 "only 'strict' and 'surrogateescape' error handlers "
3301 "are supported, not '%s'",
3302 errors);
3303 return -1;
3304}
3305
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003306PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003307PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003308{
3309 Py_ssize_t wlen, wlen2;
3310 wchar_t *wstr;
3311 PyObject *bytes = NULL;
3312 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003313 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003314 PyObject *exc;
3315 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003316 int surrogateescape;
3317
3318 if (locale_error_handler(errors, &surrogateescape) < 0)
3319 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003320
3321 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3322 if (wstr == NULL)
3323 return NULL;
3324
3325 wlen2 = wcslen(wstr);
3326 if (wlen2 != wlen) {
3327 PyMem_Free(wstr);
3328 PyErr_SetString(PyExc_TypeError, "embedded null character");
3329 return NULL;
3330 }
3331
3332 if (surrogateescape) {
3333 /* locale encoding with surrogateescape */
3334 char *str;
3335
3336 str = _Py_wchar2char(wstr, &error_pos);
3337 if (str == NULL) {
3338 if (error_pos == (size_t)-1) {
3339 PyErr_NoMemory();
3340 PyMem_Free(wstr);
3341 return NULL;
3342 }
3343 else {
3344 goto encode_error;
3345 }
3346 }
3347 PyMem_Free(wstr);
3348
3349 bytes = PyBytes_FromString(str);
3350 PyMem_Free(str);
3351 }
3352 else {
3353 size_t len, len2;
3354
3355 len = wcstombs(NULL, wstr, 0);
3356 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003357 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003358 goto encode_error;
3359 }
3360
3361 bytes = PyBytes_FromStringAndSize(NULL, len);
3362 if (bytes == NULL) {
3363 PyMem_Free(wstr);
3364 return NULL;
3365 }
3366
3367 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3368 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003369 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003370 goto encode_error;
3371 }
3372 PyMem_Free(wstr);
3373 }
3374 return bytes;
3375
3376encode_error:
3377 errmsg = strerror(errno);
3378 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003379
3380 if (error_pos == (size_t)-1)
3381 error_pos = wcstombs_errorpos(wstr);
3382
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003383 PyMem_Free(wstr);
3384 Py_XDECREF(bytes);
3385
Victor Stinner2f197072011-12-17 07:08:30 +01003386 if (errmsg != NULL) {
3387 size_t errlen;
3388 wstr = _Py_char2wchar(errmsg, &errlen);
3389 if (wstr != NULL) {
3390 reason = PyUnicode_FromWideChar(wstr, errlen);
3391 PyMem_Free(wstr);
3392 } else
3393 errmsg = NULL;
3394 }
3395 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003396 reason = PyUnicode_FromString(
3397 "wcstombs() encountered an unencodable "
3398 "wide character");
3399 if (reason == NULL)
3400 return NULL;
3401
3402 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3403 "locale", unicode,
3404 (Py_ssize_t)error_pos,
3405 (Py_ssize_t)(error_pos+1),
3406 reason);
3407 Py_DECREF(reason);
3408 if (exc != NULL) {
3409 PyCodec_StrictErrors(exc);
3410 Py_XDECREF(exc);
3411 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003412 return NULL;
3413}
3414
Victor Stinnerad158722010-10-27 00:25:46 +00003415PyObject *
3416PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003417{
Victor Stinner99b95382011-07-04 14:23:54 +02003418#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003419 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003420#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003421 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003422#else
Victor Stinner793b5312011-04-27 00:24:21 +02003423 PyInterpreterState *interp = PyThreadState_GET()->interp;
3424 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3425 cannot use it to encode and decode filenames before it is loaded. Load
3426 the Python codec requires to encode at least its own filename. Use the C
3427 version of the locale codec until the codec registry is initialized and
3428 the Python codec is loaded.
3429
3430 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3431 cannot only rely on it: check also interp->fscodec_initialized for
3432 subinterpreters. */
3433 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003434 return PyUnicode_AsEncodedString(unicode,
3435 Py_FileSystemDefaultEncoding,
3436 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003437 }
3438 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003439 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003440 }
Victor Stinnerad158722010-10-27 00:25:46 +00003441#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003442}
3443
Alexander Belopolsky40018472011-02-26 01:02:56 +00003444PyObject *
3445PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003446 const char *encoding,
3447 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448{
3449 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003450 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003451
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 if (!PyUnicode_Check(unicode)) {
3453 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003455 }
Fred Drakee4315f52000-05-09 19:53:39 +00003456
Fred Drakee4315f52000-05-09 19:53:39 +00003457 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003458 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003459 if ((strcmp(lower, "utf-8") == 0) ||
3460 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003461 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003462 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003464 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003465 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003466 }
Victor Stinner37296e82010-06-10 13:36:23 +00003467 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003468 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003469 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003470 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003471#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003472 else if (strcmp(lower, "mbcs") == 0)
3473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003474#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003475 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478
3479 /* Encode via the codec registry */
3480 v = PyCodec_Encode(unicode, encoding, errors);
3481 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003482 return NULL;
3483
3484 /* The normal path */
3485 if (PyBytes_Check(v))
3486 return v;
3487
3488 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003490 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003491 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003492
3493 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3494 "encoder %s returned bytearray instead of bytes",
3495 encoding);
3496 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003497 Py_DECREF(v);
3498 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003499 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003501 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3502 Py_DECREF(v);
3503 return b;
3504 }
3505
3506 PyErr_Format(PyExc_TypeError,
3507 "encoder did not return a bytes object (type=%.400s)",
3508 Py_TYPE(v)->tp_name);
3509 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003510 return NULL;
3511}
3512
Alexander Belopolsky40018472011-02-26 01:02:56 +00003513PyObject *
3514PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003515 const char *encoding,
3516 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003517{
3518 PyObject *v;
3519
3520 if (!PyUnicode_Check(unicode)) {
3521 PyErr_BadArgument();
3522 goto onError;
3523 }
3524
3525 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003526 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003527
3528 /* Encode via the codec registry */
3529 v = PyCodec_Encode(unicode, encoding, errors);
3530 if (v == NULL)
3531 goto onError;
3532 if (!PyUnicode_Check(v)) {
3533 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003534 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003535 Py_TYPE(v)->tp_name);
3536 Py_DECREF(v);
3537 goto onError;
3538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003540
Benjamin Peterson29060642009-01-31 22:14:21 +00003541 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 return NULL;
3543}
3544
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   decode.

   Return the byte offset of the first invalid or incomplete multibyte
   sequence, or 0 if none is found.  Without HAVE_MBRTOWC the scan is not
   possible and 0 is always returned. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    mbstate_t state;
    size_t consumed;
    wchar_t ch;

    memset(&state, 0, sizeof state);
    while (len) {
        consumed = mbrtowc(&ch, cursor, len, &state);
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        len -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3575
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003576PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003577PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003578 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003579{
3580 wchar_t smallbuf[256];
3581 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3582 wchar_t *wstr;
3583 size_t wlen, wlen2;
3584 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003585 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003586 size_t error_pos;
3587 char *errmsg;
3588 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003589
3590 if (locale_error_handler(errors, &surrogateescape) < 0)
3591 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003592
3593 if (str[len] != '\0' || len != strlen(str)) {
3594 PyErr_SetString(PyExc_TypeError, "embedded null character");
3595 return NULL;
3596 }
3597
3598 if (surrogateescape)
3599 {
3600 wstr = _Py_char2wchar(str, &wlen);
3601 if (wstr == NULL) {
3602 if (wlen == (size_t)-1)
3603 PyErr_NoMemory();
3604 else
3605 PyErr_SetFromErrno(PyExc_OSError);
3606 return NULL;
3607 }
3608
3609 unicode = PyUnicode_FromWideChar(wstr, wlen);
3610 PyMem_Free(wstr);
3611 }
3612 else {
3613#ifndef HAVE_BROKEN_MBSTOWCS
3614 wlen = mbstowcs(NULL, str, 0);
3615#else
3616 wlen = len;
3617#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003618 if (wlen == (size_t)-1)
3619 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003620 if (wlen+1 <= smallbuf_len) {
3621 wstr = smallbuf;
3622 }
3623 else {
3624 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3625 return PyErr_NoMemory();
3626
3627 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3628 if (!wstr)
3629 return PyErr_NoMemory();
3630 }
3631
3632 /* This shouldn't fail now */
3633 wlen2 = mbstowcs(wstr, str, wlen+1);
3634 if (wlen2 == (size_t)-1) {
3635 if (wstr != smallbuf)
3636 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003637 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003638 }
3639#ifdef HAVE_BROKEN_MBSTOWCS
3640 assert(wlen2 == wlen);
3641#endif
3642 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3643 if (wstr != smallbuf)
3644 PyMem_Free(wstr);
3645 }
3646 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003647
3648decode_error:
3649 errmsg = strerror(errno);
3650 assert(errmsg != NULL);
3651
3652 error_pos = mbstowcs_errorpos(str, len);
3653 if (errmsg != NULL) {
3654 size_t errlen;
3655 wstr = _Py_char2wchar(errmsg, &errlen);
3656 if (wstr != NULL) {
3657 reason = PyUnicode_FromWideChar(wstr, errlen);
3658 PyMem_Free(wstr);
3659 } else
3660 errmsg = NULL;
3661 }
3662 if (errmsg == NULL)
3663 reason = PyUnicode_FromString(
3664 "mbstowcs() encountered an invalid multibyte sequence");
3665 if (reason == NULL)
3666 return NULL;
3667
3668 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3669 "locale", str, len,
3670 (Py_ssize_t)error_pos,
3671 (Py_ssize_t)(error_pos+1),
3672 reason);
3673 Py_DECREF(reason);
3674 if (exc != NULL) {
3675 PyCodec_StrictErrors(exc);
3676 Py_XDECREF(exc);
3677 }
3678 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003679}
3680
3681PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003682PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003683{
3684 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003685 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003686}
3687
3688
3689PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003690PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003691 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003692 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3693}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003694
Christian Heimes5894ba72007-11-04 11:43:14 +00003695PyObject*
3696PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3697{
Victor Stinner99b95382011-07-04 14:23:54 +02003698#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003699 return PyUnicode_DecodeMBCS(s, size, NULL);
3700#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003701 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003702#else
Victor Stinner793b5312011-04-27 00:24:21 +02003703 PyInterpreterState *interp = PyThreadState_GET()->interp;
3704 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3705 cannot use it to encode and decode filenames before it is loaded. Load
3706 the Python codec requires to encode at least its own filename. Use the C
3707 version of the locale codec until the codec registry is initialized and
3708 the Python codec is loaded.
3709
3710 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3711 cannot only rely on it: check also interp->fscodec_initialized for
3712 subinterpreters. */
3713 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003714 return PyUnicode_Decode(s, size,
3715 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003716 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003717 }
3718 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003719 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003720 }
Victor Stinnerad158722010-10-27 00:25:46 +00003721#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003722}
3723
Martin v. Löwis011e8422009-05-05 04:43:17 +00003724
3725int
Antoine Pitrou13348842012-01-29 18:36:34 +01003726_PyUnicode_HasNULChars(PyObject* s)
3727{
3728 static PyObject *nul = NULL;
3729
3730 if (nul == NULL)
3731 nul = PyUnicode_FromStringAndSize("\0", 1);
3732 if (nul == NULL)
3733 return -1;
3734 return PyUnicode_Contains(s, nul);
3735}
3736
3737
3738int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003739PyUnicode_FSConverter(PyObject* arg, void* addr)
3740{
3741 PyObject *output = NULL;
3742 Py_ssize_t size;
3743 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003744 if (arg == NULL) {
3745 Py_DECREF(*(PyObject**)addr);
3746 return 1;
3747 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003748 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003749 output = arg;
3750 Py_INCREF(output);
3751 }
3752 else {
3753 arg = PyUnicode_FromObject(arg);
3754 if (!arg)
3755 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003756 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003757 Py_DECREF(arg);
3758 if (!output)
3759 return 0;
3760 if (!PyBytes_Check(output)) {
3761 Py_DECREF(output);
3762 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3763 return 0;
3764 }
3765 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003766 size = PyBytes_GET_SIZE(output);
3767 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003768 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003769 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770 Py_DECREF(output);
3771 return 0;
3772 }
3773 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003774 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003775}
3776
3777
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003778int
3779PyUnicode_FSDecoder(PyObject* arg, void* addr)
3780{
3781 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003782 if (arg == NULL) {
3783 Py_DECREF(*(PyObject**)addr);
3784 return 1;
3785 }
3786 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003787 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003788 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003789 output = arg;
3790 Py_INCREF(output);
3791 }
3792 else {
3793 arg = PyBytes_FromObject(arg);
3794 if (!arg)
3795 return 0;
3796 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3797 PyBytes_GET_SIZE(arg));
3798 Py_DECREF(arg);
3799 if (!output)
3800 return 0;
3801 if (!PyUnicode_Check(output)) {
3802 Py_DECREF(output);
3803 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3804 return 0;
3805 }
3806 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003807 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003808 Py_DECREF(output);
3809 return 0;
3810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003811 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003812 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003813 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3814 Py_DECREF(output);
3815 return 0;
3816 }
3817 *(PyObject**)addr = output;
3818 return Py_CLEANUP_SUPPORTED;
3819}
3820
3821
Martin v. Löwis5b222132007-06-10 09:51:05 +00003822char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003823PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003824{
Christian Heimesf3863112007-11-22 07:46:41 +00003825 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003827 if (!PyUnicode_Check(unicode)) {
3828 PyErr_BadArgument();
3829 return NULL;
3830 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003832 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003833
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003834 if (PyUnicode_UTF8(unicode) == NULL) {
3835 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3837 if (bytes == NULL)
3838 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003839 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3840 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003841 Py_DECREF(bytes);
3842 return NULL;
3843 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003844 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3845 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3846 PyBytes_AS_STRING(bytes),
3847 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003848 Py_DECREF(bytes);
3849 }
3850
3851 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003852 *psize = PyUnicode_UTF8_LENGTH(unicode);
3853 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003854}
3855
3856char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003859 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3860}
3861
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003862Py_UNICODE *
3863PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3864{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 const unsigned char *one_byte;
3866#if SIZEOF_WCHAR_T == 4
3867 const Py_UCS2 *two_bytes;
3868#else
3869 const Py_UCS4 *four_bytes;
3870 const Py_UCS4 *ucs4_end;
3871 Py_ssize_t num_surrogates;
3872#endif
3873 wchar_t *w;
3874 wchar_t *wchar_end;
3875
3876 if (!PyUnicode_Check(unicode)) {
3877 PyErr_BadArgument();
3878 return NULL;
3879 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003880 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003881 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003882 assert(_PyUnicode_KIND(unicode) != 0);
3883 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003885 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003886#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003887 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3888 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003889 num_surrogates = 0;
3890
3891 for (; four_bytes < ucs4_end; ++four_bytes) {
3892 if (*four_bytes > 0xFFFF)
3893 ++num_surrogates;
3894 }
3895
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3897 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3898 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 PyErr_NoMemory();
3900 return NULL;
3901 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003902 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003903
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003904 w = _PyUnicode_WSTR(unicode);
3905 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3906 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3908 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003909 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003910 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003911 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3912 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003913 }
3914 else
3915 *w = *four_bytes;
3916
3917 if (w > wchar_end) {
3918 assert(0 && "Miscalculated string end");
3919 }
3920 }
3921 *w = 0;
3922#else
3923 /* sizeof(wchar_t) == 4 */
3924 Py_FatalError("Impossible unicode object state, wstr and str "
3925 "should share memory already.");
3926 return NULL;
3927#endif
3928 }
3929 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003930 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3931 (_PyUnicode_LENGTH(unicode) + 1));
3932 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003933 PyErr_NoMemory();
3934 return NULL;
3935 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003936 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3937 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3938 w = _PyUnicode_WSTR(unicode);
3939 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003940
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003941 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3942 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 for (; w < wchar_end; ++one_byte, ++w)
3944 *w = *one_byte;
3945 /* null-terminate the wstr */
3946 *w = 0;
3947 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003948 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003951 for (; w < wchar_end; ++two_bytes, ++w)
3952 *w = *two_bytes;
3953 /* null-terminate the wstr */
3954 *w = 0;
3955#else
3956 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003957 PyObject_FREE(_PyUnicode_WSTR(unicode));
3958 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003959 Py_FatalError("Impossible unicode object state, wstr "
3960 "and str should share memory already.");
3961 return NULL;
3962#endif
3963 }
3964 else {
3965 assert(0 && "This should never happen.");
3966 }
3967 }
3968 }
3969 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003970 *size = PyUnicode_WSTR_LENGTH(unicode);
3971 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003972}
3973
Alexander Belopolsky40018472011-02-26 01:02:56 +00003974Py_UNICODE *
3975PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003977 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978}
3979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980
Alexander Belopolsky40018472011-02-26 01:02:56 +00003981Py_ssize_t
3982PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983{
3984 if (!PyUnicode_Check(unicode)) {
3985 PyErr_BadArgument();
3986 goto onError;
3987 }
3988 return PyUnicode_GET_SIZE(unicode);
3989
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 return -1;
3992}
3993
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003994Py_ssize_t
3995PyUnicode_GetLength(PyObject *unicode)
3996{
Victor Stinner07621332012-06-16 04:53:46 +02003997 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 PyErr_BadArgument();
3999 return -1;
4000 }
Victor Stinner07621332012-06-16 04:53:46 +02004001 if (PyUnicode_READY(unicode) == -1)
4002 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003 return PyUnicode_GET_LENGTH(unicode);
4004}
4005
4006Py_UCS4
4007PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4008{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004009 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4010 PyErr_BadArgument();
4011 return (Py_UCS4)-1;
4012 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004013 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004014 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 return (Py_UCS4)-1;
4016 }
4017 return PyUnicode_READ_CHAR(unicode, index);
4018}
4019
4020int
4021PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4022{
4023 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004024 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025 return -1;
4026 }
Victor Stinner488fa492011-12-12 00:01:39 +01004027 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004028 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004029 PyErr_SetString(PyExc_IndexError, "string index out of range");
4030 return -1;
4031 }
Victor Stinner488fa492011-12-12 00:01:39 +01004032 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004033 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004034 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4035 PyErr_SetString(PyExc_ValueError, "character out of range");
4036 return -1;
4037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4039 index, ch);
4040 return 0;
4041}
4042
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4048
Victor Stinner554f3f02010-06-16 23:33:54 +00004049/* create or adjust a UnicodeDecodeError */
4050static void
4051make_decode_exception(PyObject **exceptionObject,
4052 const char *encoding,
4053 const char *input, Py_ssize_t length,
4054 Py_ssize_t startpos, Py_ssize_t endpos,
4055 const char *reason)
4056{
4057 if (*exceptionObject == NULL) {
4058 *exceptionObject = PyUnicodeDecodeError_Create(
4059 encoding, input, length, startpos, endpos, reason);
4060 }
4061 else {
4062 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4063 goto onError;
4064 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4065 goto onError;
4066 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4067 goto onError;
4068 }
4069 return;
4070
4071onError:
4072 Py_DECREF(*exceptionObject);
4073 *exceptionObject = NULL;
4074}
4075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004076/* error handling callback helper:
4077 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004078 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004079 and adjust various state variables.
4080 return 0 on success, -1 on error
4081*/
4082
Alexander Belopolsky40018472011-02-26 01:02:56 +00004083static int
4084unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004085 const char *encoding, const char *reason,
4086 const char **input, const char **inend, Py_ssize_t *startinpos,
4087 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004088 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004090 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004091
4092 PyObject *restuple = NULL;
4093 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004094 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004095 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004096 Py_ssize_t requiredsize;
4097 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004098 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004099 int res = -1;
4100
Victor Stinner596a6c42011-11-09 00:02:18 +01004101 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4102 outsize = PyUnicode_GET_LENGTH(*output);
4103 else
4104 outsize = _PyUnicode_WSTR_LENGTH(*output);
4105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004106 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004107 *errorHandler = PyCodec_LookupError(errors);
4108 if (*errorHandler == NULL)
4109 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004110 }
4111
Victor Stinner554f3f02010-06-16 23:33:54 +00004112 make_decode_exception(exceptionObject,
4113 encoding,
4114 *input, *inend - *input,
4115 *startinpos, *endinpos,
4116 reason);
4117 if (*exceptionObject == NULL)
4118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119
4120 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4121 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004124 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004125 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004126 }
4127 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004128 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004129 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004130 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004131
4132 /* Copy back the bytes variables, which might have been modified by the
4133 callback */
4134 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4135 if (!inputobj)
4136 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004137 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004139 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004140 *input = PyBytes_AS_STRING(inputobj);
4141 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004142 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004143 /* we can DECREF safely, as the exception has another reference,
4144 so the object won't go away. */
4145 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004147 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004148 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004149 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004150 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4151 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153
Victor Stinner596a6c42011-11-09 00:02:18 +01004154 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4155 /* need more space? (at least enough for what we
4156 have+the replacement+the rest of the string (starting
4157 at the new input position), so we won't have to check space
4158 when there are no errors in the rest of the string) */
4159 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4160 requiredsize = *outpos + replen + insize-newpos;
4161 if (requiredsize > outsize) {
4162 if (requiredsize<2*outsize)
4163 requiredsize = 2*outsize;
4164 if (unicode_resize(output, requiredsize) < 0)
4165 goto onError;
4166 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004167 if (unicode_widen(output, *outpos,
4168 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004169 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004170 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004171 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004173 else {
4174 wchar_t *repwstr;
4175 Py_ssize_t repwlen;
4176 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4177 if (repwstr == NULL)
4178 goto onError;
4179 /* need more space? (at least enough for what we
4180 have+the replacement+the rest of the string (starting
4181 at the new input position), so we won't have to check space
4182 when there are no errors in the rest of the string) */
4183 requiredsize = *outpos + repwlen + insize-newpos;
4184 if (requiredsize > outsize) {
4185 if (requiredsize < 2*outsize)
4186 requiredsize = 2*outsize;
4187 if (unicode_resize(output, requiredsize) < 0)
4188 goto onError;
4189 }
4190 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4191 *outpos += repwlen;
4192 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004194 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 /* we made it! */
4197 res = 0;
4198
Benjamin Peterson29060642009-01-31 22:14:21 +00004199 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004200 Py_XDECREF(restuple);
4201 return res;
4202}
4203
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  Note that the character-testing
   macros below evaluate their argument more than once, so arguments must
   not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

/* Read-only lookup table, indexed by ASCII code (0..127). */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test on c comes first so that utf7_category is never indexed
 * out of bounds; code 0 is never encoded directly.  directO and directWS
 * are parenthesized so that compound expressions can be passed. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004284
Alexander Belopolsky40018472011-02-26 01:02:56 +00004285PyObject *
4286PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004287 Py_ssize_t size,
4288 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004289{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004290 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4291}
4292
Antoine Pitrou244651a2009-05-04 18:56:13 +00004293/* The decoder. The only state we preserve is our read position,
4294 * i.e. how many characters we have consumed. So if we end in the
4295 * middle of a shift sequence we have to back off the read position
4296 * and the output to the beginning of the sequence, otherwise we lose
4297 * all the shift state (seen bits, number of bits seen, high
4298 * surrogate). */
4299
Alexander Belopolsky40018472011-02-26 01:02:56 +00004300PyObject *
4301PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004302 Py_ssize_t size,
4303 const char *errors,
4304 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004305{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004306 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004307 Py_ssize_t startinpos;
4308 Py_ssize_t endinpos;
4309 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004310 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004311 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004312 const char *errmsg = "";
4313 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004314 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004315 unsigned int base64bits = 0;
4316 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004317 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004318 PyObject *errorHandler = NULL;
4319 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004320
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004321 /* Start off assuming it's all ASCII. Widen later as necessary. */
4322 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 if (!unicode)
4324 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004325 if (size == 0) {
4326 if (consumed)
4327 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004328 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004329 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004330
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 e = s + size;
4333
4334 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004335 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004336 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004337 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004338
Antoine Pitrou244651a2009-05-04 18:56:13 +00004339 if (inShift) { /* in a base-64 section */
4340 if (IS_BASE64(ch)) { /* consume a base-64 character */
4341 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4342 base64bits += 6;
4343 s++;
4344 if (base64bits >= 16) {
4345 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004346 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 base64bits -= 16;
4348 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4349 if (surrogate) {
4350 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004351 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4352 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004353 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4354 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004355 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004356 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004357 }
4358 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004359 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4360 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004362 }
4363 }
Victor Stinner551ac952011-11-29 22:58:13 +01004364 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004365 /* first surrogate */
4366 surrogate = outCh;
4367 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004369 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4370 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 }
4372 }
4373 }
4374 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004375 inShift = 0;
4376 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004378 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4379 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004380 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004381 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 if (base64bits > 0) { /* left-over bits */
4383 if (base64bits >= 6) {
4384 /* We've seen at least one base-64 character */
4385 errmsg = "partial character in shift sequence";
4386 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004387 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004388 else {
4389 /* Some bits remain; they should be zero */
4390 if (base64buffer != 0) {
4391 errmsg = "non-zero padding bits in shift sequence";
4392 goto utf7Error;
4393 }
4394 }
4395 }
4396 if (ch != '-') {
4397 /* '-' is absorbed; other terminating
4398 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004399 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4400 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004402 }
4403 }
4404 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004406 s++; /* consume '+' */
4407 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004408 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004409 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4410 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 }
4412 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004413 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004414 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004416 }
4417 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004418 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004419 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4420 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 s++;
4422 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004423 else {
4424 startinpos = s-starts;
4425 s++;
4426 errmsg = "unexpected special character";
4427 goto utf7Error;
4428 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004431 endinpos = s-starts;
4432 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004433 errors, &errorHandler,
4434 "utf7", errmsg,
4435 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004436 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 }
4439
Antoine Pitrou244651a2009-05-04 18:56:13 +00004440 /* end of string */
4441
4442 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4443 /* if we're in an inconsistent state, that's an error */
4444 if (surrogate ||
4445 (base64bits >= 6) ||
4446 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004447 endinpos = size;
4448 if (unicode_decode_call_errorhandler(
4449 errors, &errorHandler,
4450 "utf7", "unterminated shift sequence",
4451 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004452 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 goto onError;
4454 if (s < e)
4455 goto restart;
4456 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004457 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004458
4459 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004461 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004462 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004463 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004464 }
4465 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004466 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004468 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004469
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004470 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004471 goto onError;
4472
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004473 Py_XDECREF(errorHandler);
4474 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004475 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004476
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 Py_XDECREF(errorHandler);
4479 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 Py_DECREF(unicode);
4481 return NULL;
4482}
4483
4484
Alexander Belopolsky40018472011-02-26 01:02:56 +00004485PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004486_PyUnicode_EncodeUTF7(PyObject *str,
4487 int base64SetO,
4488 int base64WhiteSpace,
4489 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004491 int kind;
4492 void *data;
4493 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004494 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004495 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004496 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 unsigned int base64bits = 0;
4498 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499 char * out;
4500 char * start;
4501
Benjamin Petersonbac79492012-01-14 13:34:47 -05004502 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004503 return NULL;
4504 kind = PyUnicode_KIND(str);
4505 data = PyUnicode_DATA(str);
4506 len = PyUnicode_GET_LENGTH(str);
4507
4508 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004511 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004512 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004513 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004514 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004515 if (v == NULL)
4516 return NULL;
4517
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004518 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004519 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004520 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521
Antoine Pitrou244651a2009-05-04 18:56:13 +00004522 if (inShift) {
4523 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4524 /* shifting out */
4525 if (base64bits) { /* output remaining bits */
4526 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4527 base64buffer = 0;
4528 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004529 }
4530 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 /* Characters not in the BASE64 set implicitly unshift the sequence
4532 so no '-' is required, except if the character is itself a '-' */
4533 if (IS_BASE64(ch) || ch == '-') {
4534 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 *out++ = (char) ch;
4537 }
4538 else {
4539 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004540 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 else { /* not in a shift sequence */
4543 if (ch == '+') {
4544 *out++ = '+';
4545 *out++ = '-';
4546 }
4547 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4548 *out++ = (char) ch;
4549 }
4550 else {
4551 *out++ = '+';
4552 inShift = 1;
4553 goto encode_char;
4554 }
4555 }
4556 continue;
4557encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004558 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004559 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004560
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 /* code first surrogate */
4562 base64bits += 16;
4563 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4564 while (base64bits >= 6) {
4565 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4566 base64bits -= 6;
4567 }
4568 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004569 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 base64bits += 16;
4572 base64buffer = (base64buffer << 16) | ch;
4573 while (base64bits >= 6) {
4574 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4575 base64bits -= 6;
4576 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 if (base64bits)
4579 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4580 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004582 if (_PyBytes_Resize(&v, out - start) < 0)
4583 return NULL;
4584 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004586PyObject *
4587PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4588 Py_ssize_t size,
4589 int base64SetO,
4590 int base64WhiteSpace,
4591 const char *errors)
4592{
4593 PyObject *result;
4594 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4595 if (tmp == NULL)
4596 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004597 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004598 base64WhiteSpace, errors);
4599 Py_DECREF(tmp);
4600 return result;
4601}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004602
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603#undef IS_BASE64
4604#undef FROM_BASE64
4605#undef TO_BASE64
4606#undef DECODE_DIRECT
4607#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004608
Guido van Rossumd57fd912000-03-10 22:53:23 +00004609/* --- UTF-8 Codec -------------------------------------------------------- */
4610
Alexander Belopolsky40018472011-02-26 01:02:56 +00004611PyObject *
4612PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004613 Py_ssize_t size,
4614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615{
Walter Dörwald69652032004-09-07 20:24:22 +00004616 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4617}
4618
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004619#include "stringlib/asciilib.h"
4620#include "stringlib/codecs.h"
4621#include "stringlib/undef.h"
4622
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004623#include "stringlib/ucs1lib.h"
4624#include "stringlib/codecs.h"
4625#include "stringlib/undef.h"
4626
4627#include "stringlib/ucs2lib.h"
4628#include "stringlib/codecs.h"
4629#include "stringlib/undef.h"
4630
4631#include "stringlib/ucs4lib.h"
4632#include "stringlib/codecs.h"
4633#include "stringlib/undef.h"
4634
Antoine Pitrouab868312009-01-10 15:40:25 +00004635/* Mask to quickly check whether a C 'long' contains a
4636 non-ASCII, UTF8-encoded char. */
4637#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004638# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004639#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004640# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004641#else
4642# error C 'long' size should be either 4 or 8!
4643#endif
4644
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004645static Py_ssize_t
4646ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004647{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004649 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004650
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004651#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004652 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4653 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 /* Fast path, see in STRINGLIB(utf8_decode) for
4655 an explanation. */
4656 /* Help register allocation */
4657 register const char *_p = p;
4658 register Py_UCS1 * q = dest;
4659 while (_p < aligned_end) {
4660 unsigned long value = *(const unsigned long *) _p;
4661 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004662 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 *((unsigned long *)q) = value;
4664 _p += SIZEOF_LONG;
4665 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004666 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 p = _p;
4668 while (p < end) {
4669 if ((unsigned char)*p & 0x80)
4670 break;
4671 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004673 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004675#endif
4676 while (p < end) {
4677 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4678 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004679 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680 /* Help register allocation */
4681 register const char *_p = p;
4682 while (_p < aligned_end) {
4683 unsigned long value = *(unsigned long *) _p;
4684 if (value & ASCII_CHAR_MASK)
4685 break;
4686 _p += SIZEOF_LONG;
4687 }
4688 p = _p;
4689 if (_p == end)
4690 break;
4691 }
4692 if ((unsigned char)*p & 0x80)
4693 break;
4694 ++p;
4695 }
4696 memcpy(dest, start, p - start);
4697 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698}
Antoine Pitrouab868312009-01-10 15:40:25 +00004699
Victor Stinner785938e2011-12-11 20:09:03 +01004700PyObject *
4701PyUnicode_DecodeUTF8Stateful(const char *s,
4702 Py_ssize_t size,
4703 const char *errors,
4704 Py_ssize_t *consumed)
4705{
Victor Stinner785938e2011-12-11 20:09:03 +01004706 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004707 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004708 const char *end = s + size;
4709 Py_ssize_t outpos;
4710
4711 Py_ssize_t startinpos;
4712 Py_ssize_t endinpos;
4713 const char *errmsg = "";
4714 PyObject *errorHandler = NULL;
4715 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004716
4717 if (size == 0) {
4718 if (consumed)
4719 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004720 Py_INCREF(unicode_empty);
4721 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004722 }
4723
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4725 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004726 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004727 *consumed = 1;
4728 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004729 }
4730
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004731 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004732 if (!unicode)
4733 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004734
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4736 s += outpos;
4737 while (s < end) {
4738 Py_UCS4 ch;
4739 int kind = PyUnicode_KIND(unicode);
4740 if (kind == PyUnicode_1BYTE_KIND) {
4741 if (PyUnicode_IS_ASCII(unicode))
4742 ch = asciilib_utf8_decode(&s, end,
4743 PyUnicode_1BYTE_DATA(unicode), &outpos);
4744 else
4745 ch = ucs1lib_utf8_decode(&s, end,
4746 PyUnicode_1BYTE_DATA(unicode), &outpos);
4747 } else if (kind == PyUnicode_2BYTE_KIND) {
4748 ch = ucs2lib_utf8_decode(&s, end,
4749 PyUnicode_2BYTE_DATA(unicode), &outpos);
4750 } else {
4751 assert(kind == PyUnicode_4BYTE_KIND);
4752 ch = ucs4lib_utf8_decode(&s, end,
4753 PyUnicode_4BYTE_DATA(unicode), &outpos);
4754 }
4755
4756 switch (ch) {
4757 case 0:
4758 if (s == end || consumed)
4759 goto End;
4760 errmsg = "unexpected end of data";
4761 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004762 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763 break;
4764 case 1:
4765 errmsg = "invalid start byte";
4766 startinpos = s - starts;
4767 endinpos = startinpos + 1;
4768 break;
4769 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004770 case 3:
4771 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004772 errmsg = "invalid continuation byte";
4773 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004774 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 break;
4776 default:
4777 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4778 goto onError;
4779 continue;
4780 }
4781
4782 if (unicode_decode_call_errorhandler(
4783 errors, &errorHandler,
4784 "utf-8", errmsg,
4785 &starts, &end, &startinpos, &endinpos, &exc, &s,
4786 &unicode, &outpos))
4787 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004788 }
4789
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004790End:
4791 if (unicode_resize(&unicode, outpos) < 0)
4792 goto onError;
4793
4794 if (consumed)
4795 *consumed = s - starts;
4796
4797 Py_XDECREF(errorHandler);
4798 Py_XDECREF(exc);
4799 assert(_PyUnicode_CheckConsistency(unicode, 1));
4800 return unicode;
4801
4802onError:
4803 Py_XDECREF(errorHandler);
4804 Py_XDECREF(exc);
4805 Py_XDECREF(unicode);
4806 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004807}
4808
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error.

   Bytes that cannot be decoded are not reported as errors: each one is
   mapped to the code unit 0xDC00 + byte and decoding resumes with the
   next byte. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* The stringlib decoder writes everything it can store straight
           into `unicode` and returns either a small status value or a
           code point that does not fit the output unit. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* every code point fits a 4-byte wchar_t */
            assert(0);
#else
            /* What arrives here is a code point above the BMP that
               still has to be split into a surrogate pair -- it is not
               itself a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004865/* Primary internal function which creates utf8 encoded bytes objects.
4866
4867 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004868 and allocate exactly as much space needed at the end. Else allocate the
4869 maximum possible needed (4 result bytes per Unicode character), and return
4870 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004871*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004872PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004873_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Victor Stinner6099a032011-12-18 14:22:26 +01004875 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 void *data;
4877 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879 if (!PyUnicode_Check(unicode)) {
4880 PyErr_BadArgument();
4881 return NULL;
4882 }
4883
4884 if (PyUnicode_READY(unicode) == -1)
4885 return NULL;
4886
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004887 if (PyUnicode_UTF8(unicode))
4888 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4889 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004890
4891 kind = PyUnicode_KIND(unicode);
4892 data = PyUnicode_DATA(unicode);
4893 size = PyUnicode_GET_LENGTH(unicode);
4894
Benjamin Petersonead6b532011-12-20 17:23:42 -06004895 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004896 default:
4897 assert(0);
4898 case PyUnicode_1BYTE_KIND:
4899 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4900 assert(!PyUnicode_IS_ASCII(unicode));
4901 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4902 case PyUnicode_2BYTE_KIND:
4903 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4904 case PyUnicode_4BYTE_KIND:
4905 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004906 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907}
4908
Alexander Belopolsky40018472011-02-26 01:02:56 +00004909PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004910PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4911 Py_ssize_t size,
4912 const char *errors)
4913{
4914 PyObject *v, *unicode;
4915
4916 unicode = PyUnicode_FromUnicode(s, size);
4917 if (unicode == NULL)
4918 return NULL;
4919 v = _PyUnicode_AsUTF8String(unicode, errors);
4920 Py_DECREF(unicode);
4921 return v;
4922}
4923
4924PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004925PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004927 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928}
4929
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930/* --- UTF-32 Codec ------------------------------------------------------- */
4931
4932PyObject *
4933PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 Py_ssize_t size,
4935 const char *errors,
4936 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937{
4938 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4939}
4940
4941PyObject *
4942PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 Py_ssize_t size,
4944 const char *errors,
4945 int *byteorder,
4946 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947{
4948 const char *starts = s;
4949 Py_ssize_t startinpos;
4950 Py_ssize_t endinpos;
4951 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004952 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004953 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 int bo = 0; /* assume native ordering by default */
4955 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 /* Offsets from q for retrieving bytes in the right order. */
4957#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4958 int iorder[] = {0, 1, 2, 3};
4959#else
4960 int iorder[] = {3, 2, 1, 0};
4961#endif
4962 PyObject *errorHandler = NULL;
4963 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004964
Walter Dörwald41980ca2007-08-16 21:55:45 +00004965 q = (unsigned char *)s;
4966 e = q + size;
4967
4968 if (byteorder)
4969 bo = *byteorder;
4970
4971 /* Check for BOM marks (U+FEFF) in the input and adjust current
4972 byte order setting accordingly. In native mode, the leading BOM
4973 mark is skipped, in all other modes, it is copied to the output
4974 stream as-is (giving a ZWNBSP character). */
4975 if (bo == 0) {
4976 if (size >= 4) {
4977 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004978 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 if (bom == 0x0000FEFF) {
4981 q += 4;
4982 bo = -1;
4983 }
4984 else if (bom == 0xFFFE0000) {
4985 q += 4;
4986 bo = 1;
4987 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 if (bom == 0x0000FEFF) {
4990 q += 4;
4991 bo = 1;
4992 }
4993 else if (bom == 0xFFFE0000) {
4994 q += 4;
4995 bo = -1;
4996 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004999 }
5000
5001 if (bo == -1) {
5002 /* force LE */
5003 iorder[0] = 0;
5004 iorder[1] = 1;
5005 iorder[2] = 2;
5006 iorder[3] = 3;
5007 }
5008 else if (bo == 1) {
5009 /* force BE */
5010 iorder[0] = 3;
5011 iorder[1] = 2;
5012 iorder[2] = 1;
5013 iorder[3] = 0;
5014 }
5015
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005016 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005017 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005018 if (!unicode)
5019 return NULL;
5020 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005021 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005022 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005023
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 Py_UCS4 ch;
5026 /* remaining bytes at the end? (size should be divisible by 4) */
5027 if (e-q<4) {
5028 if (consumed)
5029 break;
5030 errmsg = "truncated data";
5031 startinpos = ((const char *)q)-starts;
5032 endinpos = ((const char *)e)-starts;
5033 goto utf32Error;
5034 /* The remaining input chars are ignored if the callback
5035 chooses to skip the input */
5036 }
5037 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5038 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 if (ch >= 0x110000)
5041 {
5042 errmsg = "codepoint not in range(0x110000)";
5043 startinpos = ((const char *)q)-starts;
5044 endinpos = startinpos+4;
5045 goto utf32Error;
5046 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005047 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5048 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 q += 4;
5050 continue;
5051 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 if (unicode_decode_call_errorhandler(
5053 errors, &errorHandler,
5054 "utf32", errmsg,
5055 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005056 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005058 }
5059
5060 if (byteorder)
5061 *byteorder = bo;
5062
5063 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065
5066 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005067 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068 goto onError;
5069
5070 Py_XDECREF(errorHandler);
5071 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005072 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073
Benjamin Peterson29060642009-01-31 22:14:21 +00005074 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075 Py_DECREF(unicode);
5076 Py_XDECREF(errorHandler);
5077 Py_XDECREF(exc);
5078 return NULL;
5079}
5080
5081PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005082_PyUnicode_EncodeUTF32(PyObject *str,
5083 const char *errors,
5084 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005086 int kind;
5087 void *data;
5088 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005089 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005091 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 /* Offsets from p for storing byte pairs in the right order. */
5093#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5094 int iorder[] = {0, 1, 2, 3};
5095#else
5096 int iorder[] = {3, 2, 1, 0};
5097#endif
5098
Benjamin Peterson29060642009-01-31 22:14:21 +00005099#define STORECHAR(CH) \
5100 do { \
5101 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5102 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5103 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5104 p[iorder[0]] = (CH) & 0xff; \
5105 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 } while(0)
5107
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005108 if (!PyUnicode_Check(str)) {
5109 PyErr_BadArgument();
5110 return NULL;
5111 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005112 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005113 return NULL;
5114 kind = PyUnicode_KIND(str);
5115 data = PyUnicode_DATA(str);
5116 len = PyUnicode_GET_LENGTH(str);
5117
5118 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005119 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005120 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005121 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005122 if (v == NULL)
5123 return NULL;
5124
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005125 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005128 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005129 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130
5131 if (byteorder == -1) {
5132 /* force LE */
5133 iorder[0] = 0;
5134 iorder[1] = 1;
5135 iorder[2] = 2;
5136 iorder[3] = 3;
5137 }
5138 else if (byteorder == 1) {
5139 /* force BE */
5140 iorder[0] = 3;
5141 iorder[1] = 2;
5142 iorder[2] = 1;
5143 iorder[3] = 0;
5144 }
5145
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005146 for (i = 0; i < len; i++)
5147 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005148
5149 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005150 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151#undef STORECHAR
5152}
5153
Alexander Belopolsky40018472011-02-26 01:02:56 +00005154PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005155PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5156 Py_ssize_t size,
5157 const char *errors,
5158 int byteorder)
5159{
5160 PyObject *result;
5161 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5162 if (tmp == NULL)
5163 return NULL;
5164 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5165 Py_DECREF(tmp);
5166 return result;
5167}
5168
5169PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005170PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005171{
Victor Stinnerb960b342011-11-20 19:12:52 +01005172 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005173}
5174
/* --- UTF-16 Codec ------------------------------------------------------- */
5176
Tim Peters772747b2001-08-09 22:21:55 +00005177PyObject *
5178PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005179 Py_ssize_t size,
5180 const char *errors,
5181 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182{
Walter Dörwald69652032004-09-07 20:24:22 +00005183 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5184}
5185
5186PyObject *
5187PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005188 Py_ssize_t size,
5189 const char *errors,
5190 int *byteorder,
5191 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005194 Py_ssize_t startinpos;
5195 Py_ssize_t endinpos;
5196 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005197 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005198 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005199 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005200 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005201 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 PyObject *errorHandler = NULL;
5203 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204
Tim Peters772747b2001-08-09 22:21:55 +00005205 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207
5208 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005209 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005211 /* Check for BOM marks (U+FEFF) in the input and adjust current
5212 byte order setting accordingly. In native mode, the leading BOM
5213 mark is skipped, in all other modes, it is copied to the output
5214 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005215 if (bo == 0 && size >= 2) {
5216 const Py_UCS4 bom = (q[1] << 8) | q[0];
5217 if (bom == 0xFEFF) {
5218 q += 2;
5219 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005220 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005221 else if (bom == 0xFFFE) {
5222 q += 2;
5223 bo = 1;
5224 }
5225 if (byteorder)
5226 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 if (q == e) {
5230 if (consumed)
5231 *consumed = size;
5232 Py_INCREF(unicode_empty);
5233 return unicode_empty;
Tim Peters772747b2001-08-09 22:21:55 +00005234 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005235
Antoine Pitrouab868312009-01-10 15:40:25 +00005236#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005238#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005239 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005240#endif
Tim Peters772747b2001-08-09 22:21:55 +00005241
Antoine Pitrou63065d72012-05-15 23:48:04 +02005242 /* Note: size will always be longer than the resulting Unicode
5243 character count */
5244 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5245 if (!unicode)
5246 return NULL;
5247
5248 outpos = 0;
5249 while (1) {
5250 Py_UCS4 ch = 0;
5251 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005252 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005253 if (kind == PyUnicode_1BYTE_KIND) {
5254 if (PyUnicode_IS_ASCII(unicode))
5255 ch = asciilib_utf16_decode(&q, e,
5256 PyUnicode_1BYTE_DATA(unicode), &outpos,
5257 native_ordering);
5258 else
5259 ch = ucs1lib_utf16_decode(&q, e,
5260 PyUnicode_1BYTE_DATA(unicode), &outpos,
5261 native_ordering);
5262 } else if (kind == PyUnicode_2BYTE_KIND) {
5263 ch = ucs2lib_utf16_decode(&q, e,
5264 PyUnicode_2BYTE_DATA(unicode), &outpos,
5265 native_ordering);
5266 } else {
5267 assert(kind == PyUnicode_4BYTE_KIND);
5268 ch = ucs4lib_utf16_decode(&q, e,
5269 PyUnicode_4BYTE_DATA(unicode), &outpos,
5270 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005271 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005272 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005273
Antoine Pitrou63065d72012-05-15 23:48:04 +02005274 switch (ch)
5275 {
5276 case 0:
5277 /* remaining byte at the end? (size should be even) */
5278 if (q == e || consumed)
5279 goto End;
5280 errmsg = "truncated data";
5281 startinpos = ((const char *)q) - starts;
5282 endinpos = ((const char *)e) - starts;
5283 break;
5284 /* The remaining input chars are ignored if the callback
5285 chooses to skip the input */
5286 case 1:
5287 errmsg = "unexpected end of data";
5288 startinpos = ((const char *)q) - 2 - starts;
5289 endinpos = ((const char *)e) - starts;
5290 break;
5291 case 2:
5292 errmsg = "illegal encoding";
5293 startinpos = ((const char *)q) - 2 - starts;
5294 endinpos = startinpos + 2;
5295 break;
5296 case 3:
5297 errmsg = "illegal UTF-16 surrogate";
5298 startinpos = ((const char *)q) - 4 - starts;
5299 endinpos = startinpos + 2;
5300 break;
5301 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005302 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5303 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005304 continue;
5305 }
5306
Benjamin Peterson29060642009-01-31 22:14:21 +00005307 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005308 errors,
5309 &errorHandler,
5310 "utf16", errmsg,
5311 &starts,
5312 (const char **)&e,
5313 &startinpos,
5314 &endinpos,
5315 &exc,
5316 (const char **)&q,
5317 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005318 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005319 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005320 }
5321
Antoine Pitrou63065d72012-05-15 23:48:04 +02005322End:
Walter Dörwald69652032004-09-07 20:24:22 +00005323 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005324 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005325
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005327 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 goto onError;
5329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005330 Py_XDECREF(errorHandler);
5331 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005332 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005336 Py_XDECREF(errorHandler);
5337 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 return NULL;
5339}
5340
Tim Peters772747b2001-08-09 22:21:55 +00005341PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005342_PyUnicode_EncodeUTF16(PyObject *str,
5343 const char *errors,
5344 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005346 enum PyUnicode_Kind kind;
5347 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005348 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005349 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005350 unsigned short *out;
5351 Py_ssize_t bytesize;
5352 Py_ssize_t pairs;
5353#ifdef WORDS_BIGENDIAN
5354 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005355#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005356 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005357#endif
5358
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005359 if (!PyUnicode_Check(str)) {
5360 PyErr_BadArgument();
5361 return NULL;
5362 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005363 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005364 return NULL;
5365 kind = PyUnicode_KIND(str);
5366 data = PyUnicode_DATA(str);
5367 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005369 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005370 if (kind == PyUnicode_4BYTE_KIND) {
5371 const Py_UCS4 *in = (const Py_UCS4 *)data;
5372 const Py_UCS4 *end = in + len;
5373 while (in < end)
5374 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005375 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005376 }
5377 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005379 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005380 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 if (v == NULL)
5382 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005384 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005385 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005386 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005388 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005390 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005391
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005392 switch (kind) {
5393 case PyUnicode_1BYTE_KIND: {
5394 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5395 break;
Tim Peters772747b2001-08-09 22:21:55 +00005396 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005397 case PyUnicode_2BYTE_KIND: {
5398 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5399 break;
Tim Peters772747b2001-08-09 22:21:55 +00005400 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005401 case PyUnicode_4BYTE_KIND: {
5402 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5403 break;
5404 }
5405 default:
5406 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005407 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005408
5409 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005410 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411}
5412
Alexander Belopolsky40018472011-02-26 01:02:56 +00005413PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005414PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5415 Py_ssize_t size,
5416 const char *errors,
5417 int byteorder)
5418{
5419 PyObject *result;
5420 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5421 if (tmp == NULL)
5422 return NULL;
5423 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5424 Py_DECREF(tmp);
5425 return result;
5426}
5427
5428PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005429PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005431 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432}
5433
/* --- Unicode Escape Codec ----------------------------------------------- */
5435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005436/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5437 if all the escapes in the string make it still a valid ASCII string.
5438 Returns -1 if any escapes were found which cause the string to
5439 pop out of ASCII range. Otherwise returns the length of the
5440 required buffer to hold the string.
5441 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005442static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005443length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5444{
5445 const unsigned char *p = (const unsigned char *)s;
5446 const unsigned char *end = p + size;
5447 Py_ssize_t length = 0;
5448
5449 if (size < 0)
5450 return -1;
5451
5452 for (; p < end; ++p) {
5453 if (*p > 127) {
5454 /* Non-ASCII */
5455 return -1;
5456 }
5457 else if (*p != '\\') {
5458 /* Normal character */
5459 ++length;
5460 }
5461 else {
5462 /* Backslash-escape, check next char */
5463 ++p;
5464 /* Escape sequence reaches till end of string or
5465 non-ASCII follow-up. */
5466 if (p >= end || *p > 127)
5467 return -1;
5468 switch (*p) {
5469 case '\n':
5470 /* backslash + \n result in zero characters */
5471 break;
5472 case '\\': case '\'': case '\"':
5473 case 'b': case 'f': case 't':
5474 case 'n': case 'r': case 'v': case 'a':
5475 ++length;
5476 break;
5477 case '0': case '1': case '2': case '3':
5478 case '4': case '5': case '6': case '7':
5479 case 'x': case 'u': case 'U': case 'N':
5480 /* these do not guarantee ASCII characters */
5481 return -1;
5482 default:
5483 /* count the backslash + the other character */
5484 length += 2;
5485 }
5486 }
5487 }
5488 return length;
5489}
5490
Fredrik Lundh06d12682001-01-24 07:59:11 +00005491static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005492
Alexander Belopolsky40018472011-02-26 01:02:56 +00005493PyObject *
5494PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005495 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005496 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005498 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005499 Py_ssize_t startinpos;
5500 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005502 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005504 char* message;
5505 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005506 PyObject *errorHandler = NULL;
5507 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005508 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005509 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005510
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005511 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005512
5513 /* After length_of_escaped_ascii_string() there are two alternatives,
5514 either the string is pure ASCII with named escapes like \n, etc.
5515 and we determined it's exact size (common case)
5516 or it contains \x, \u, ... escape sequences. then we create a
5517 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005518 if (len >= 0) {
5519 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005520 if (!v)
5521 goto onError;
5522 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005523 }
5524 else {
5525 /* Escaped strings will always be longer than the resulting
5526 Unicode string, so we start with size here and then reduce the
5527 length after conversion to the true value.
5528 (but if the error callback returns a long replacement string
5529 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005530 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005531 if (!v)
5532 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005533 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005534 }
5535
Guido van Rossumd57fd912000-03-10 22:53:23 +00005536 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005537 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005538 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005540
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 while (s < end) {
5542 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005543 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005544 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 /* The only case in which i == ascii_length is a backslash
5547 followed by a newline. */
5548 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 /* Non-escape characters are interpreted as Unicode ordinals */
5551 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005552 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5553 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 continue;
5555 }
5556
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 /* \ - Escapes */
5559 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005560 c = *s++;
5561 if (s > end)
5562 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005564 /* The only case in which i == ascii_length is a backslash
5565 followed by a newline. */
5566 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005568 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569
Benjamin Peterson29060642009-01-31 22:14:21 +00005570 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005571#define WRITECHAR(ch) \
5572 do { \
5573 if (unicode_putchar(&v, &i, ch) < 0) \
5574 goto onError; \
5575 }while(0)
5576
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005578 case '\\': WRITECHAR('\\'); break;
5579 case '\'': WRITECHAR('\''); break;
5580 case '\"': WRITECHAR('\"'); break;
5581 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005583 case 'f': WRITECHAR('\014'); break;
5584 case 't': WRITECHAR('\t'); break;
5585 case 'n': WRITECHAR('\n'); break;
5586 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005587 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005588 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005589 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005590 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 case '0': case '1': case '2': case '3':
5594 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005595 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005596 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005597 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005598 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005599 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 break;
5603
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 /* hex escapes */
5605 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005607 digits = 2;
5608 message = "truncated \\xXX escape";
5609 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005613 digits = 4;
5614 message = "truncated \\uXXXX escape";
5615 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005618 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005619 digits = 8;
5620 message = "truncated \\UXXXXXXXX escape";
5621 hexescape:
5622 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005623 if (s+digits>end) {
5624 endinpos = size;
5625 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 errors, &errorHandler,
5627 "unicodeescape", "end of string in escape sequence",
5628 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005629 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005630 goto onError;
5631 goto nextByte;
5632 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 for (j = 0; j < digits; ++j) {
5634 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005635 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005636 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005637 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005638 errors, &errorHandler,
5639 "unicodeescape", message,
5640 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005641 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005642 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005645 }
5646 chr = (chr<<4) & ~0xF;
5647 if (c >= '0' && c <= '9')
5648 chr += c - '0';
5649 else if (c >= 'a' && c <= 'f')
5650 chr += 10 + c - 'a';
5651 else
5652 chr += 10 + c - 'A';
5653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005655 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 /* _decoding_error will have already written into the
5657 target buffer. */
5658 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005659 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005660 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005661 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005662 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005663 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005664 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005666 errors, &errorHandler,
5667 "unicodeescape", "illegal Unicode character",
5668 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005669 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005670 goto onError;
5671 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005672 break;
5673
Benjamin Peterson29060642009-01-31 22:14:21 +00005674 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005675 case 'N':
5676 message = "malformed \\N character escape";
5677 if (ucnhash_CAPI == NULL) {
5678 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5680 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005681 if (ucnhash_CAPI == NULL)
5682 goto ucnhashError;
5683 }
5684 if (*s == '{') {
5685 const char *start = s+1;
5686 /* look for the closing brace */
5687 while (*s != '}' && s < end)
5688 s++;
5689 if (s > start && s < end && *s == '}') {
5690 /* found a name. look it up in the unicode database */
5691 message = "unknown Unicode character name";
5692 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005694 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005695 goto store;
5696 }
5697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005699 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005700 errors, &errorHandler,
5701 "unicodeescape", message,
5702 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005703 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005704 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005705 break;
5706
5707 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005708 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 message = "\\ at end of string";
5710 s--;
5711 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005713 errors, &errorHandler,
5714 "unicodeescape", message,
5715 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005716 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005717 goto onError;
5718 }
5719 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005720 WRITECHAR('\\');
5721 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005722 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005723 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005726 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005728#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005729
Victor Stinner16e6a802011-12-12 13:24:15 +01005730 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005732 Py_XDECREF(errorHandler);
5733 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005734 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005735
Benjamin Peterson29060642009-01-31 22:14:21 +00005736 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005737 PyErr_SetString(
5738 PyExc_UnicodeError,
5739 "\\N escapes not supported (can't load unicodedata module)"
5740 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005741 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 Py_XDECREF(errorHandler);
5743 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005744 return NULL;
5745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005748 Py_XDECREF(errorHandler);
5749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 return NULL;
5751}
5752
5753/* Return a Unicode-Escape string version of the Unicode object.
5754
5755 If quotes is true, the string is enclosed in u"" or u'' quotes as
5756 appropriate.
5757
5758*/
5759
Alexander Belopolsky40018472011-02-26 01:02:56 +00005760PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005761PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005764 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005766 int kind;
5767 void *data;
5768 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769
Ezio Melottie7f90372012-10-05 03:33:31 +03005770 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005771 escape.
5772
Ezio Melottie7f90372012-10-05 03:33:31 +03005773 For UCS1 strings it's '\xxx', 4 bytes per source character.
5774 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5775 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005776 */
5777
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005778 if (!PyUnicode_Check(unicode)) {
5779 PyErr_BadArgument();
5780 return NULL;
5781 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005782 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005783 return NULL;
5784 len = PyUnicode_GET_LENGTH(unicode);
5785 kind = PyUnicode_KIND(unicode);
5786 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005787 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005788 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5789 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5790 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5791 }
5792
5793 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005794 return PyBytes_FromStringAndSize(NULL, 0);
5795
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005796 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005797 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005798
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005799 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005800 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005802 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803 if (repr == NULL)
5804 return NULL;
5805
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005806 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005807
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005808 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005809 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Walter Dörwald79e913e2007-05-12 11:08:06 +00005811 /* Escape backslashes */
5812 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 *p++ = '\\';
5814 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005815 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005816 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005817
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005818 /* Map 21-bit characters to '\U00xxxxxx' */
5819 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005820 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005821 *p++ = '\\';
5822 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005823 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5824 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5825 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5826 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5827 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5828 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5829 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5830 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005832 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005833
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005835 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 *p++ = '\\';
5837 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005838 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5839 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5840 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5841 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005842 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005843
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005844 /* Map special whitespace to '\t', \n', '\r' */
5845 else if (ch == '\t') {
5846 *p++ = '\\';
5847 *p++ = 't';
5848 }
5849 else if (ch == '\n') {
5850 *p++ = '\\';
5851 *p++ = 'n';
5852 }
5853 else if (ch == '\r') {
5854 *p++ = '\\';
5855 *p++ = 'r';
5856 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005857
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005858 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005859 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005861 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005862 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5863 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005864 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005865
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 /* Copy everything else as-is */
5867 else
5868 *p++ = (char) ch;
5869 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005870
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005871 assert(p - PyBytes_AS_STRING(repr) > 0);
5872 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5873 return NULL;
5874 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875}
5876
Alexander Belopolsky40018472011-02-26 01:02:56 +00005877PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5879 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005880{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005881 PyObject *result;
5882 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5883 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005885 result = PyUnicode_AsUnicodeEscapeString(tmp);
5886 Py_DECREF(tmp);
5887 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888}
5889
5890/* --- Raw Unicode Escape Codec ------------------------------------------- */
5891
Alexander Belopolsky40018472011-02-26 01:02:56 +00005892PyObject *
5893PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005894 Py_ssize_t size,
5895 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005898 Py_ssize_t startinpos;
5899 Py_ssize_t endinpos;
5900 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005901 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 const char *end;
5903 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005904 PyObject *errorHandler = NULL;
5905 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005906
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 /* Escaped strings will always be longer than the resulting
5908 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005909 length after conversion to the true value. (But decoding error
5910 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005911 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005915 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005916 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 end = s + size;
5918 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 unsigned char c;
5920 Py_UCS4 x;
5921 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005922 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923
Benjamin Peterson29060642009-01-31 22:14:21 +00005924 /* Non-escape characters are interpreted as Unicode ordinals */
5925 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005926 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5927 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005929 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 startinpos = s-starts;
5931
5932 /* \u-escapes are only interpreted iff the number of leading
5933 backslashes if odd */
5934 bs = s;
5935 for (;s < end;) {
5936 if (*s != '\\')
5937 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005938 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5939 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 }
5941 if (((s - bs) & 1) == 0 ||
5942 s >= end ||
5943 (*s != 'u' && *s != 'U')) {
5944 continue;
5945 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005946 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 count = *s=='u' ? 4 : 8;
5948 s++;
5949
5950 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005951 for (x = 0, i = 0; i < count; ++i, ++s) {
5952 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005953 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005954 endinpos = s-starts;
5955 if (unicode_decode_call_errorhandler(
5956 errors, &errorHandler,
5957 "rawunicodeescape", "truncated \\uXXXX",
5958 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005959 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005960 goto onError;
5961 goto nextByte;
5962 }
5963 x = (x<<4) & ~0xF;
5964 if (c >= '0' && c <= '9')
5965 x += c - '0';
5966 else if (c >= 'a' && c <= 'f')
5967 x += 10 + c - 'a';
5968 else
5969 x += 10 + c - 'A';
5970 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005971 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005972 if (unicode_putchar(&v, &outpos, x) < 0)
5973 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005974 } else {
5975 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005976 if (unicode_decode_call_errorhandler(
5977 errors, &errorHandler,
5978 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005980 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005982 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 nextByte:
5984 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005986 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005988 Py_XDECREF(errorHandler);
5989 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005990 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005991
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005994 Py_XDECREF(errorHandler);
5995 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 return NULL;
5997}
5998
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999
Alexander Belopolsky40018472011-02-26 01:02:56 +00006000PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006001PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006003 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 char *p;
6005 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006006 Py_ssize_t expandsize, pos;
6007 int kind;
6008 void *data;
6009 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006011 if (!PyUnicode_Check(unicode)) {
6012 PyErr_BadArgument();
6013 return NULL;
6014 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006015 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016 return NULL;
6017 kind = PyUnicode_KIND(unicode);
6018 data = PyUnicode_DATA(unicode);
6019 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006020 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6021 bytes, and 1 byte characters 4. */
6022 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006023
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006024 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006026
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028 if (repr == NULL)
6029 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006030 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006031 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006033 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 for (pos = 0; pos < len; pos++) {
6035 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 /* Map 32-bit characters to '\Uxxxxxxxx' */
6037 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006038 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006039 *p++ = '\\';
6040 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006041 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6042 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6043 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6044 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6045 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6046 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6047 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6048 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006049 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006051 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 *p++ = '\\';
6053 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006054 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6055 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6056 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6057 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006059 /* Copy everything else as-is */
6060 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061 *p++ = (char) ch;
6062 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006063
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006064 assert(p > q);
6065 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006066 return NULL;
6067 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068}
6069
Alexander Belopolsky40018472011-02-26 01:02:56 +00006070PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6072 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006074 PyObject *result;
6075 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6076 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006077 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006078 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6079 Py_DECREF(tmp);
6080 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006081}
6082
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006083/* --- Unicode Internal Codec ------------------------------------------- */
6084
Alexander Belopolsky40018472011-02-26 01:02:56 +00006085PyObject *
6086_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006087 Py_ssize_t size,
6088 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006089{
6090 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006091 Py_ssize_t startinpos;
6092 Py_ssize_t endinpos;
6093 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006094 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006095 const char *end;
6096 const char *reason;
6097 PyObject *errorHandler = NULL;
6098 PyObject *exc = NULL;
6099
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006100 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006101 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006102 1))
6103 return NULL;
6104
Thomas Wouters89f507f2006-12-13 04:49:30 +00006105 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006106 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006107 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006108 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006109 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006110 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006111 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006112 end = s + size;
6113
6114 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006115 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006116 Py_UCS4 ch;
6117 /* We copy the raw representation one byte at a time because the
6118 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006119 ((char *) &uch)[0] = s[0];
6120 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006121#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006122 ((char *) &uch)[2] = s[2];
6123 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006124#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006125 ch = uch;
6126
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006127 /* We have to sanity check the raw data, otherwise doom looms for
6128 some malformed UCS-4 data. */
6129 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006130#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006131 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006132#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006133 end-s < Py_UNICODE_SIZE
6134 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006136 startinpos = s - starts;
6137 if (end-s < Py_UNICODE_SIZE) {
6138 endinpos = end-starts;
6139 reason = "truncated input";
6140 }
6141 else {
6142 endinpos = s - starts + Py_UNICODE_SIZE;
6143 reason = "illegal code point (> 0x10FFFF)";
6144 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006145 if (unicode_decode_call_errorhandler(
6146 errors, &errorHandler,
6147 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006148 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006149 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006150 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006151 continue;
6152 }
6153
6154 s += Py_UNICODE_SIZE;
6155#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006156 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006157 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006158 Py_UNICODE uch2;
6159 ((char *) &uch2)[0] = s[0];
6160 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006161 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006162 {
Victor Stinner551ac952011-11-29 22:58:13 +01006163 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006164 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006165 }
6166 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006167#endif
6168
6169 if (unicode_putchar(&v, &outpos, ch) < 0)
6170 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006171 }
6172
Victor Stinner16e6a802011-12-12 13:24:15 +01006173 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006174 goto onError;
6175 Py_XDECREF(errorHandler);
6176 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006177 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006178
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006180 Py_XDECREF(v);
6181 Py_XDECREF(errorHandler);
6182 Py_XDECREF(exc);
6183 return NULL;
6184}
6185
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186/* --- Latin-1 Codec ------------------------------------------------------ */
6187
Alexander Belopolsky40018472011-02-26 01:02:56 +00006188PyObject *
6189PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006190 Py_ssize_t size,
6191 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006194 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195}
6196
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006197/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006198static void
6199make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006200 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006201 PyObject *unicode,
6202 Py_ssize_t startpos, Py_ssize_t endpos,
6203 const char *reason)
6204{
6205 if (*exceptionObject == NULL) {
6206 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006207 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006208 encoding, unicode, startpos, endpos, reason);
6209 }
6210 else {
6211 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6212 goto onError;
6213 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6214 goto onError;
6215 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6216 goto onError;
6217 return;
6218 onError:
6219 Py_DECREF(*exceptionObject);
6220 *exceptionObject = NULL;
6221 }
6222}
6223
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006224/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006225static void
6226raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006227 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006228 PyObject *unicode,
6229 Py_ssize_t startpos, Py_ssize_t endpos,
6230 const char *reason)
6231{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006232 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006233 encoding, unicode, startpos, endpos, reason);
6234 if (*exceptionObject != NULL)
6235 PyCodec_StrictErrors(*exceptionObject);
6236}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237
6238/* error handling callback helper:
6239 build arguments, call the callback and check the arguments,
6240 put the result into newpos and return the replacement string, which
6241 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006242static PyObject *
6243unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006244 PyObject **errorHandler,
6245 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006246 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006247 Py_ssize_t startpos, Py_ssize_t endpos,
6248 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006249{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006250 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 PyObject *restuple;
6253 PyObject *resunicode;
6254
6255 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006257 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259 }
6260
Benjamin Petersonbac79492012-01-14 13:34:47 -05006261 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006262 return NULL;
6263 len = PyUnicode_GET_LENGTH(unicode);
6264
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006265 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006266 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006267 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006269
6270 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006275 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 Py_DECREF(restuple);
6277 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006279 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 &resunicode, newpos)) {
6281 Py_DECREF(restuple);
6282 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006284 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6285 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6286 Py_DECREF(restuple);
6287 return NULL;
6288 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006290 *newpos = len + *newpos;
6291 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6293 Py_DECREF(restuple);
6294 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006295 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006296 Py_INCREF(resunicode);
6297 Py_DECREF(restuple);
6298 return resunicode;
6299}
6300
Alexander Belopolsky40018472011-02-26 01:02:56 +00006301static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006302unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006303 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006304 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006305{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006306 /* input state */
6307 Py_ssize_t pos=0, size;
6308 int kind;
6309 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006310 /* output object */
6311 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006312 /* pointer into the output */
6313 char *str;
6314 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006315 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006316 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6317 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 PyObject *errorHandler = NULL;
6319 PyObject *exc = NULL;
6320 /* the following variable is used for caching string comparisons
6321 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6322 int known_errorHandler = -1;
6323
Benjamin Petersonbac79492012-01-14 13:34:47 -05006324 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006325 return NULL;
6326 size = PyUnicode_GET_LENGTH(unicode);
6327 kind = PyUnicode_KIND(unicode);
6328 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006329 /* allocate enough for a simple encoding without
6330 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006331 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006332 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006333 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006334 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006335 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006336 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006337 ressize = size;
6338
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006339 while (pos < size) {
6340 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 /* can we encode this? */
6343 if (c<limit) {
6344 /* no overflow check, because we know that the space is enough */
6345 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006346 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006347 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006348 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 Py_ssize_t requiredsize;
6350 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006352 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006353 Py_ssize_t collstart = pos;
6354 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006356 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 ++collend;
6358 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6359 if (known_errorHandler==-1) {
6360 if ((errors==NULL) || (!strcmp(errors, "strict")))
6361 known_errorHandler = 1;
6362 else if (!strcmp(errors, "replace"))
6363 known_errorHandler = 2;
6364 else if (!strcmp(errors, "ignore"))
6365 known_errorHandler = 3;
6366 else if (!strcmp(errors, "xmlcharrefreplace"))
6367 known_errorHandler = 4;
6368 else
6369 known_errorHandler = 0;
6370 }
6371 switch (known_errorHandler) {
6372 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006373 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 goto onError;
6375 case 2: /* replace */
6376 while (collstart++<collend)
6377 *str++ = '?'; /* fall through */
6378 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 break;
6381 case 4: /* xmlcharrefreplace */
6382 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 /* determine replacement size */
6384 for (i = collstart, repsize = 0; i < collend; ++i) {
6385 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6386 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006390 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006394 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006395 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006396 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006398 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006399 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006401 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006403 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 if (requiredsize > ressize) {
6405 if (requiredsize<2*ressize)
6406 requiredsize = 2*ressize;
6407 if (_PyBytes_Resize(&res, requiredsize))
6408 goto onError;
6409 str = PyBytes_AS_STRING(res) + respos;
6410 ressize = requiredsize;
6411 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006412 /* generate replacement */
6413 for (i = collstart; i < collend; ++i) {
6414 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006417 break;
6418 default:
6419 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420 encoding, reason, unicode, &exc,
6421 collstart, collend, &newpos);
6422 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006423 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006425 if (PyBytes_Check(repunicode)) {
6426 /* Directly copy bytes result to output. */
6427 repsize = PyBytes_Size(repunicode);
6428 if (repsize > 1) {
6429 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006430 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006431 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6432 Py_DECREF(repunicode);
6433 goto onError;
6434 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006435 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006436 ressize += repsize-1;
6437 }
6438 memcpy(str, PyBytes_AsString(repunicode), repsize);
6439 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006440 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006441 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006442 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006443 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 /* need more space? (at least enough for what we
6445 have+the replacement+the rest of the string, so
6446 we won't have to check space for encodable characters) */
6447 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006448 repsize = PyUnicode_GET_LENGTH(repunicode);
6449 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 if (requiredsize > ressize) {
6451 if (requiredsize<2*ressize)
6452 requiredsize = 2*ressize;
6453 if (_PyBytes_Resize(&res, requiredsize)) {
6454 Py_DECREF(repunicode);
6455 goto onError;
6456 }
6457 str = PyBytes_AS_STRING(res) + respos;
6458 ressize = requiredsize;
6459 }
6460 /* check if there is anything unencodable in the replacement
6461 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006462 for (i = 0; repsize-->0; ++i, ++str) {
6463 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006465 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006466 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 Py_DECREF(repunicode);
6468 goto onError;
6469 }
6470 *str = (char)c;
6471 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006473 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006474 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006475 }
6476 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006477 /* Resize if we allocated to much */
6478 size = str - PyBytes_AS_STRING(res);
6479 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006480 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006481 if (_PyBytes_Resize(&res, size) < 0)
6482 goto onError;
6483 }
6484
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 Py_XDECREF(errorHandler);
6486 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006487 return res;
6488
6489 onError:
6490 Py_XDECREF(res);
6491 Py_XDECREF(errorHandler);
6492 Py_XDECREF(exc);
6493 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494}
6495
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006496/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006497PyObject *
6498PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006499 Py_ssize_t size,
6500 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006502 PyObject *result;
6503 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6504 if (unicode == NULL)
6505 return NULL;
6506 result = unicode_encode_ucs1(unicode, errors, 256);
6507 Py_DECREF(unicode);
6508 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509}
6510
Alexander Belopolsky40018472011-02-26 01:02:56 +00006511PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006512_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513{
6514 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 PyErr_BadArgument();
6516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006518 if (PyUnicode_READY(unicode) == -1)
6519 return NULL;
6520 /* Fast path: if it is a one-byte string, construct
6521 bytes object directly. */
6522 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6523 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6524 PyUnicode_GET_LENGTH(unicode));
6525 /* Non-Latin-1 characters present. Defer to above function to
6526 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006527 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006528}
6529
6530PyObject*
6531PyUnicode_AsLatin1String(PyObject *unicode)
6532{
6533 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534}
6535
6536/* --- 7-bit ASCII Codec -------------------------------------------------- */
6537
Alexander Belopolsky40018472011-02-26 01:02:56 +00006538PyObject *
6539PyUnicode_DecodeASCII(const char *s,
6540 Py_ssize_t size,
6541 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006543 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006544 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006545 int kind;
6546 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006547 Py_ssize_t startinpos;
6548 Py_ssize_t endinpos;
6549 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 const char *e;
6551 PyObject *errorHandler = NULL;
6552 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006553
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006554 if (size == 0) {
6555 Py_INCREF(unicode_empty);
6556 return unicode_empty;
6557 }
6558
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006560 if (size == 1 && (unsigned char)s[0] < 128)
6561 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006562
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006563 unicode = PyUnicode_New(size, 127);
6564 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006566
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006567 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006568 data = PyUnicode_1BYTE_DATA(unicode);
6569 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6570 if (outpos == size)
6571 return unicode;
6572
6573 s += outpos;
6574 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006575 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 register unsigned char c = (unsigned char)*s;
6577 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006578 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 ++s;
6580 }
6581 else {
6582 startinpos = s-starts;
6583 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 if (unicode_decode_call_errorhandler(
6585 errors, &errorHandler,
6586 "ascii", "ordinal not in range(128)",
6587 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006588 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006589 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006590 kind = PyUnicode_KIND(unicode);
6591 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006593 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006594 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006595 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006596 Py_XDECREF(errorHandler);
6597 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006598 assert(_PyUnicode_CheckConsistency(unicode, 1));
6599 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006600
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006602 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006605 return NULL;
6606}
6607
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006608/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006609PyObject *
6610PyUnicode_EncodeASCII(const Py_UNICODE *p,
6611 Py_ssize_t size,
6612 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614 PyObject *result;
6615 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6616 if (unicode == NULL)
6617 return NULL;
6618 result = unicode_encode_ucs1(unicode, errors, 128);
6619 Py_DECREF(unicode);
6620 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006621}
6622
Alexander Belopolsky40018472011-02-26 01:02:56 +00006623PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006624_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006625{
6626 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 PyErr_BadArgument();
6628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006630 if (PyUnicode_READY(unicode) == -1)
6631 return NULL;
6632 /* Fast path: if it is an ASCII-only string, construct bytes object
6633 directly. Else defer to above function to raise the exception. */
6634 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6635 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6636 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006638}
6639
6640PyObject *
6641PyUnicode_AsASCIIString(PyObject *unicode)
6642{
6643 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
Victor Stinner99b95382011-07-04 14:23:54 +02006646#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006647
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006648/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006649
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006650#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006651#define NEED_RETRY
6652#endif
6653
Victor Stinner3a50e702011-10-18 21:21:00 +02006654#ifndef WC_ERR_INVALID_CHARS
6655# define WC_ERR_INVALID_CHARS 0x0080
6656#endif
6657
6658static char*
6659code_page_name(UINT code_page, PyObject **obj)
6660{
6661 *obj = NULL;
6662 if (code_page == CP_ACP)
6663 return "mbcs";
6664 if (code_page == CP_UTF7)
6665 return "CP_UTF7";
6666 if (code_page == CP_UTF8)
6667 return "CP_UTF8";
6668
6669 *obj = PyBytes_FromFormat("cp%u", code_page);
6670 if (*obj == NULL)
6671 return NULL;
6672 return PyBytes_AS_STRING(*obj);
6673}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006674
Alexander Belopolsky40018472011-02-26 01:02:56 +00006675static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006676is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677{
6678 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006679 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006680
Victor Stinner3a50e702011-10-18 21:21:00 +02006681 if (!IsDBCSLeadByteEx(code_page, *curr))
6682 return 0;
6683
6684 prev = CharPrevExA(code_page, s, curr, 0);
6685 if (prev == curr)
6686 return 1;
6687 /* FIXME: This code is limited to "true" double-byte encodings,
6688 as it assumes an incomplete character consists of a single
6689 byte. */
6690 if (curr - prev == 2)
6691 return 1;
6692 if (!IsDBCSLeadByteEx(code_page, *prev))
6693 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006694 return 0;
6695}
6696
Victor Stinner3a50e702011-10-18 21:21:00 +02006697static DWORD
6698decode_code_page_flags(UINT code_page)
6699{
6700 if (code_page == CP_UTF7) {
6701 /* The CP_UTF7 decoder only supports flags=0 */
6702 return 0;
6703 }
6704 else
6705 return MB_ERR_INVALID_CHARS;
6706}
6707
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006708/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006709 * Decode a byte string from a Windows code page into unicode object in strict
6710 * mode.
6711 *
6712 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6713 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006714 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006715static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006716decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006717 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006718 const char *in,
6719 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006720{
Victor Stinner3a50e702011-10-18 21:21:00 +02006721 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006722 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006723 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006724
6725 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006726 assert(insize > 0);
6727 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6728 if (outsize <= 0)
6729 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730
6731 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006733 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006734 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 if (*v == NULL)
6736 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006737 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006738 }
6739 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006741 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006742 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006744 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745 }
6746
6747 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006748 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6749 if (outsize <= 0)
6750 goto error;
6751 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006752
Victor Stinner3a50e702011-10-18 21:21:00 +02006753error:
6754 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6755 return -2;
6756 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006757 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006758}
6759
Victor Stinner3a50e702011-10-18 21:21:00 +02006760/*
6761 * Decode a byte string from a code page into unicode object with an error
6762 * handler.
6763 *
6764 * Returns consumed size if succeed, or raise a WindowsError or
6765 * UnicodeDecodeError exception and returns -1 on error.
6766 */
6767static int
6768decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006769 PyObject **v,
6770 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006771 const char *errors)
6772{
6773 const char *startin = in;
6774 const char *endin = in + size;
6775 const DWORD flags = decode_code_page_flags(code_page);
6776 /* Ideally, we should get reason from FormatMessage. This is the Windows
6777 2000 English version of the message. */
6778 const char *reason = "No mapping for the Unicode character exists "
6779 "in the target code page.";
6780 /* each step cannot decode more than 1 character, but a character can be
6781 represented as a surrogate pair */
6782 wchar_t buffer[2], *startout, *out;
6783 int insize, outsize;
6784 PyObject *errorHandler = NULL;
6785 PyObject *exc = NULL;
6786 PyObject *encoding_obj = NULL;
6787 char *encoding;
6788 DWORD err;
6789 int ret = -1;
6790
6791 assert(size > 0);
6792
6793 encoding = code_page_name(code_page, &encoding_obj);
6794 if (encoding == NULL)
6795 return -1;
6796
6797 if (errors == NULL || strcmp(errors, "strict") == 0) {
6798 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6799 UnicodeDecodeError. */
6800 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6801 if (exc != NULL) {
6802 PyCodec_StrictErrors(exc);
6803 Py_CLEAR(exc);
6804 }
6805 goto error;
6806 }
6807
6808 if (*v == NULL) {
6809 /* Create unicode object */
6810 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6811 PyErr_NoMemory();
6812 goto error;
6813 }
Victor Stinnerab595942011-12-17 04:59:06 +01006814 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006815 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006816 if (*v == NULL)
6817 goto error;
6818 startout = PyUnicode_AS_UNICODE(*v);
6819 }
6820 else {
6821 /* Extend unicode object */
6822 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6823 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6824 PyErr_NoMemory();
6825 goto error;
6826 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006827 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006828 goto error;
6829 startout = PyUnicode_AS_UNICODE(*v) + n;
6830 }
6831
6832 /* Decode the byte string character per character */
6833 out = startout;
6834 while (in < endin)
6835 {
6836 /* Decode a character */
6837 insize = 1;
6838 do
6839 {
6840 outsize = MultiByteToWideChar(code_page, flags,
6841 in, insize,
6842 buffer, Py_ARRAY_LENGTH(buffer));
6843 if (outsize > 0)
6844 break;
6845 err = GetLastError();
6846 if (err != ERROR_NO_UNICODE_TRANSLATION
6847 && err != ERROR_INSUFFICIENT_BUFFER)
6848 {
6849 PyErr_SetFromWindowsErr(0);
6850 goto error;
6851 }
6852 insize++;
6853 }
6854 /* 4=maximum length of a UTF-8 sequence */
6855 while (insize <= 4 && (in + insize) <= endin);
6856
6857 if (outsize <= 0) {
6858 Py_ssize_t startinpos, endinpos, outpos;
6859
6860 startinpos = in - startin;
6861 endinpos = startinpos + 1;
6862 outpos = out - PyUnicode_AS_UNICODE(*v);
6863 if (unicode_decode_call_errorhandler(
6864 errors, &errorHandler,
6865 encoding, reason,
6866 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006867 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006868 {
6869 goto error;
6870 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006871 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 }
6873 else {
6874 in += insize;
6875 memcpy(out, buffer, outsize * sizeof(wchar_t));
6876 out += outsize;
6877 }
6878 }
6879
6880 /* write a NUL character at the end */
6881 *out = 0;
6882
6883 /* Extend unicode object */
6884 outsize = out - startout;
6885 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006886 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006888 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006889
6890error:
6891 Py_XDECREF(encoding_obj);
6892 Py_XDECREF(errorHandler);
6893 Py_XDECREF(exc);
6894 return ret;
6895}
6896
Victor Stinner3a50e702011-10-18 21:21:00 +02006897static PyObject *
6898decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006899 const char *s, Py_ssize_t size,
6900 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901{
Victor Stinner76a31a62011-11-04 00:05:13 +01006902 PyObject *v = NULL;
6903 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904
Victor Stinner3a50e702011-10-18 21:21:00 +02006905 if (code_page < 0) {
6906 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6907 return NULL;
6908 }
6909
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006910 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006912
Victor Stinner76a31a62011-11-04 00:05:13 +01006913 do
6914 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 if (size > INT_MAX) {
6917 chunk_size = INT_MAX;
6918 final = 0;
6919 done = 0;
6920 }
6921 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006922#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006923 {
6924 chunk_size = (int)size;
6925 final = (consumed == NULL);
6926 done = 1;
6927 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006928
Victor Stinner76a31a62011-11-04 00:05:13 +01006929 /* Skip trailing lead-byte unless 'final' is set */
6930 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6931 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006932
Victor Stinner76a31a62011-11-04 00:05:13 +01006933 if (chunk_size == 0 && done) {
6934 if (v != NULL)
6935 break;
6936 Py_INCREF(unicode_empty);
6937 return unicode_empty;
6938 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006939
Victor Stinner76a31a62011-11-04 00:05:13 +01006940
6941 converted = decode_code_page_strict(code_page, &v,
6942 s, chunk_size);
6943 if (converted == -2)
6944 converted = decode_code_page_errors(code_page, &v,
6945 s, chunk_size,
6946 errors);
6947 assert(converted != 0);
6948
6949 if (converted < 0) {
6950 Py_XDECREF(v);
6951 return NULL;
6952 }
6953
6954 if (consumed)
6955 *consumed += converted;
6956
6957 s += converted;
6958 size -= converted;
6959 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006960
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006961 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962}
6963
Alexander Belopolsky40018472011-02-26 01:02:56 +00006964PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006965PyUnicode_DecodeCodePageStateful(int code_page,
6966 const char *s,
6967 Py_ssize_t size,
6968 const char *errors,
6969 Py_ssize_t *consumed)
6970{
6971 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6972}
6973
6974PyObject *
6975PyUnicode_DecodeMBCSStateful(const char *s,
6976 Py_ssize_t size,
6977 const char *errors,
6978 Py_ssize_t *consumed)
6979{
6980 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6981}
6982
6983PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006984PyUnicode_DecodeMBCS(const char *s,
6985 Py_ssize_t size,
6986 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006987{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006988 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6989}
6990
Victor Stinner3a50e702011-10-18 21:21:00 +02006991static DWORD
6992encode_code_page_flags(UINT code_page, const char *errors)
6993{
6994 if (code_page == CP_UTF8) {
6995 if (winver.dwMajorVersion >= 6)
6996 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6997 and later */
6998 return WC_ERR_INVALID_CHARS;
6999 else
7000 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7001 return 0;
7002 }
7003 else if (code_page == CP_UTF7) {
7004 /* CP_UTF7 only supports flags=0 */
7005 return 0;
7006 }
7007 else {
7008 if (errors != NULL && strcmp(errors, "replace") == 0)
7009 return 0;
7010 else
7011 return WC_NO_BEST_FIT_CHARS;
7012 }
7013}
7014
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007016 * Encode a Unicode string to a Windows code page into a byte string in strict
7017 * mode.
7018 *
7019 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7020 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007021 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007022static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007023encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007024 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026{
Victor Stinner554f3f02010-06-16 23:33:54 +00007027 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007028 BOOL *pusedDefaultChar = &usedDefaultChar;
7029 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007030 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007031 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007032 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007033 const DWORD flags = encode_code_page_flags(code_page, NULL);
7034 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007035 /* Create a substring so that we can get the UTF-16 representation
7036 of just the slice under consideration. */
7037 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007038
Martin v. Löwis3d325192011-11-04 18:23:06 +01007039 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007040
Victor Stinner3a50e702011-10-18 21:21:00 +02007041 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007042 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007043 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007044 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007045
Victor Stinner2fc507f2011-11-04 20:06:39 +01007046 substring = PyUnicode_Substring(unicode, offset, offset+len);
7047 if (substring == NULL)
7048 return -1;
7049 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7050 if (p == NULL) {
7051 Py_DECREF(substring);
7052 return -1;
7053 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007054
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007055 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 outsize = WideCharToMultiByte(code_page, flags,
7057 p, size,
7058 NULL, 0,
7059 NULL, pusedDefaultChar);
7060 if (outsize <= 0)
7061 goto error;
7062 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007063 if (pusedDefaultChar && *pusedDefaultChar) {
7064 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007066 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007067
Victor Stinner3a50e702011-10-18 21:21:00 +02007068 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007071 if (*outbytes == NULL) {
7072 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007073 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 }
7077 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 const Py_ssize_t n = PyBytes_Size(*outbytes);
7080 if (outsize > PY_SSIZE_T_MAX - n) {
7081 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007082 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007083 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007085 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7086 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007087 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007088 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007089 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090 }
7091
7092 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007093 outsize = WideCharToMultiByte(code_page, flags,
7094 p, size,
7095 out, outsize,
7096 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007097 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007098 if (outsize <= 0)
7099 goto error;
7100 if (pusedDefaultChar && *pusedDefaultChar)
7101 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007102 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007103
Victor Stinner3a50e702011-10-18 21:21:00 +02007104error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007105 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7107 return -2;
7108 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007109 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007110}
7111
Victor Stinner3a50e702011-10-18 21:21:00 +02007112/*
7113 * Encode a Unicode string to a Windows code page into a byte string using a
7114 * error handler.
7115 *
7116 * Returns consumed characters if succeed, or raise a WindowsError and returns
7117 * -1 on other error.
7118 */
7119static int
7120encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007121 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007122 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007123{
Victor Stinner3a50e702011-10-18 21:21:00 +02007124 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007125 Py_ssize_t pos = unicode_offset;
7126 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 /* Ideally, we should get reason from FormatMessage. This is the Windows
7128 2000 English version of the message. */
7129 const char *reason = "invalid character";
7130 /* 4=maximum length of a UTF-8 sequence */
7131 char buffer[4];
7132 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7133 Py_ssize_t outsize;
7134 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 PyObject *errorHandler = NULL;
7136 PyObject *exc = NULL;
7137 PyObject *encoding_obj = NULL;
7138 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007139 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007140 PyObject *rep;
7141 int ret = -1;
7142
7143 assert(insize > 0);
7144
7145 encoding = code_page_name(code_page, &encoding_obj);
7146 if (encoding == NULL)
7147 return -1;
7148
7149 if (errors == NULL || strcmp(errors, "strict") == 0) {
7150 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7151 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007152 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 if (exc != NULL) {
7154 PyCodec_StrictErrors(exc);
7155 Py_DECREF(exc);
7156 }
7157 Py_XDECREF(encoding_obj);
7158 return -1;
7159 }
7160
7161 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7162 pusedDefaultChar = &usedDefaultChar;
7163 else
7164 pusedDefaultChar = NULL;
7165
7166 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7167 PyErr_NoMemory();
7168 goto error;
7169 }
7170 outsize = insize * Py_ARRAY_LENGTH(buffer);
7171
7172 if (*outbytes == NULL) {
7173 /* Create string object */
7174 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7175 if (*outbytes == NULL)
7176 goto error;
7177 out = PyBytes_AS_STRING(*outbytes);
7178 }
7179 else {
7180 /* Extend string object */
7181 Py_ssize_t n = PyBytes_Size(*outbytes);
7182 if (n > PY_SSIZE_T_MAX - outsize) {
7183 PyErr_NoMemory();
7184 goto error;
7185 }
7186 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7187 goto error;
7188 out = PyBytes_AS_STRING(*outbytes) + n;
7189 }
7190
7191 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007192 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007193 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007194 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7195 wchar_t chars[2];
7196 int charsize;
7197 if (ch < 0x10000) {
7198 chars[0] = (wchar_t)ch;
7199 charsize = 1;
7200 }
7201 else {
7202 ch -= 0x10000;
7203 chars[0] = 0xd800 + (ch >> 10);
7204 chars[1] = 0xdc00 + (ch & 0x3ff);
7205 charsize = 2;
7206 }
7207
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007209 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007210 buffer, Py_ARRAY_LENGTH(buffer),
7211 NULL, pusedDefaultChar);
7212 if (outsize > 0) {
7213 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7214 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 memcpy(out, buffer, outsize);
7217 out += outsize;
7218 continue;
7219 }
7220 }
7221 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7222 PyErr_SetFromWindowsErr(0);
7223 goto error;
7224 }
7225
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 rep = unicode_encode_call_errorhandler(
7227 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007228 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007229 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 if (rep == NULL)
7231 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007232 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007233
7234 if (PyBytes_Check(rep)) {
7235 outsize = PyBytes_GET_SIZE(rep);
7236 if (outsize != 1) {
7237 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7238 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7239 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7240 Py_DECREF(rep);
7241 goto error;
7242 }
7243 out = PyBytes_AS_STRING(*outbytes) + offset;
7244 }
7245 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7246 out += outsize;
7247 }
7248 else {
7249 Py_ssize_t i;
7250 enum PyUnicode_Kind kind;
7251 void *data;
7252
Benjamin Petersonbac79492012-01-14 13:34:47 -05007253 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 Py_DECREF(rep);
7255 goto error;
7256 }
7257
7258 outsize = PyUnicode_GET_LENGTH(rep);
7259 if (outsize != 1) {
7260 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7261 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7262 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7263 Py_DECREF(rep);
7264 goto error;
7265 }
7266 out = PyBytes_AS_STRING(*outbytes) + offset;
7267 }
7268 kind = PyUnicode_KIND(rep);
7269 data = PyUnicode_DATA(rep);
7270 for (i=0; i < outsize; i++) {
7271 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7272 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007273 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007274 encoding, unicode,
7275 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 "unable to encode error handler result to ASCII");
7277 Py_DECREF(rep);
7278 goto error;
7279 }
7280 *out = (unsigned char)ch;
7281 out++;
7282 }
7283 }
7284 Py_DECREF(rep);
7285 }
7286 /* write a NUL byte */
7287 *out = 0;
7288 outsize = out - PyBytes_AS_STRING(*outbytes);
7289 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7290 if (_PyBytes_Resize(outbytes, outsize) < 0)
7291 goto error;
7292 ret = 0;
7293
7294error:
7295 Py_XDECREF(encoding_obj);
7296 Py_XDECREF(errorHandler);
7297 Py_XDECREF(exc);
7298 return ret;
7299}
7300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301static PyObject *
7302encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007303 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 const char *errors)
7305{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007308 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007309 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007310
Benjamin Petersonbac79492012-01-14 13:34:47 -05007311 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007312 return NULL;
7313 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007314
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 if (code_page < 0) {
7316 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7317 return NULL;
7318 }
7319
Martin v. Löwis3d325192011-11-04 18:23:06 +01007320 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 return PyBytes_FromStringAndSize(NULL, 0);
7322
Victor Stinner7581cef2011-11-03 22:32:33 +01007323 offset = 0;
7324 do
7325 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007327 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007328 chunks. */
7329 if (len > INT_MAX/2) {
7330 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 done = 0;
7332 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007333 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007334#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007336 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007337 done = 1;
7338 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007339
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007341 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 errors);
7343 if (ret == -2)
7344 ret = encode_code_page_errors(code_page, &outbytes,
7345 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007346 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007347 if (ret < 0) {
7348 Py_XDECREF(outbytes);
7349 return NULL;
7350 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007351
Victor Stinner7581cef2011-11-03 22:32:33 +01007352 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007353 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007354 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007355
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 return outbytes;
7357}
7358
7359PyObject *
7360PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7361 Py_ssize_t size,
7362 const char *errors)
7363{
Victor Stinner7581cef2011-11-03 22:32:33 +01007364 PyObject *unicode, *res;
7365 unicode = PyUnicode_FromUnicode(p, size);
7366 if (unicode == NULL)
7367 return NULL;
7368 res = encode_code_page(CP_ACP, unicode, errors);
7369 Py_DECREF(unicode);
7370 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007371}
7372
7373PyObject *
7374PyUnicode_EncodeCodePage(int code_page,
7375 PyObject *unicode,
7376 const char *errors)
7377{
Victor Stinner7581cef2011-11-03 22:32:33 +01007378 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007379}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007380
Alexander Belopolsky40018472011-02-26 01:02:56 +00007381PyObject *
7382PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007383{
7384 if (!PyUnicode_Check(unicode)) {
7385 PyErr_BadArgument();
7386 return NULL;
7387 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007388 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007389}
7390
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007391#undef NEED_RETRY
7392
Victor Stinner99b95382011-07-04 14:23:54 +02007393#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007394
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395/* --- Character Mapping Codec -------------------------------------------- */
7396
Alexander Belopolsky40018472011-02-26 01:02:56 +00007397PyObject *
7398PyUnicode_DecodeCharmap(const char *s,
7399 Py_ssize_t size,
7400 PyObject *mapping,
7401 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007403 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007404 Py_ssize_t startinpos;
7405 Py_ssize_t endinpos;
7406 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007407 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007408 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007409 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007410 PyObject *errorHandler = NULL;
7411 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007412
Guido van Rossumd57fd912000-03-10 22:53:23 +00007413 /* Default to Latin-1 */
7414 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007415 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007417 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007421 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007422 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007423 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007424 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007425 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007426 enum PyUnicode_Kind mapkind;
7427 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007428 Py_UCS4 x;
7429
Benjamin Petersonbac79492012-01-14 13:34:47 -05007430 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007431 return NULL;
7432
7433 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007434 mapdata = PyUnicode_DATA(mapping);
7435 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007436 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007437 unsigned char ch;
7438 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7439 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7440 if (outkind == PyUnicode_1BYTE_KIND) {
7441 void *outdata = PyUnicode_DATA(v);
7442 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7443 while (s < e) {
7444 unsigned char ch = *s;
7445 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7446 if (x > maxchar)
7447 goto Error;
7448 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7449 ++s;
7450 }
7451 break;
7452 }
7453 else if (outkind == PyUnicode_2BYTE_KIND) {
7454 void *outdata = PyUnicode_DATA(v);
7455 while (s < e) {
7456 unsigned char ch = *s;
7457 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7458 if (x == 0xFFFE)
7459 goto Error;
7460 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7461 ++s;
7462 }
7463 break;
7464 }
7465 }
7466 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007467
Benjamin Peterson29060642009-01-31 22:14:21 +00007468 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007469 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007470 else
7471 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007472Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007473 if (x == 0xfffe)
7474 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007475 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 startinpos = s-starts;
7477 endinpos = startinpos+1;
7478 if (unicode_decode_call_errorhandler(
7479 errors, &errorHandler,
7480 "charmap", "character maps to <undefined>",
7481 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007482 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007483 goto onError;
7484 }
7485 continue;
7486 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007487
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007488 if (unicode_putchar(&v, &outpos, x) < 0)
7489 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007491 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007492 }
7493 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007494 while (s < e) {
7495 unsigned char ch = *s;
7496 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007497
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7499 w = PyLong_FromLong((long)ch);
7500 if (w == NULL)
7501 goto onError;
7502 x = PyObject_GetItem(mapping, w);
7503 Py_DECREF(w);
7504 if (x == NULL) {
7505 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7506 /* No mapping found means: mapping is undefined. */
7507 PyErr_Clear();
7508 x = Py_None;
7509 Py_INCREF(x);
7510 } else
7511 goto onError;
7512 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007513
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 /* Apply mapping */
7515 if (PyLong_Check(x)) {
7516 long value = PyLong_AS_LONG(x);
Antoine Pitroua1f76552012-09-23 20:00:04 +02007517 if (value < 0 || value > MAX_UNICODE) {
7518 PyErr_Format(PyExc_TypeError,
7519 "character mapping must be in range(0x%lx)",
7520 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 Py_DECREF(x);
7522 goto onError;
7523 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007524 if (unicode_putchar(&v, &outpos, value) < 0)
7525 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 }
7527 else if (x == Py_None) {
7528 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007529 startinpos = s-starts;
7530 endinpos = startinpos+1;
7531 if (unicode_decode_call_errorhandler(
7532 errors, &errorHandler,
7533 "charmap", "character maps to <undefined>",
7534 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007535 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007536 Py_DECREF(x);
7537 goto onError;
7538 }
7539 Py_DECREF(x);
7540 continue;
7541 }
7542 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007543 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007544
Benjamin Petersonbac79492012-01-14 13:34:47 -05007545 if (PyUnicode_READY(x) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007546 goto onError;
7547 targetsize = PyUnicode_GET_LENGTH(x);
7548
7549 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007550 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007551 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007552 PyUnicode_READ_CHAR(x, 0)) < 0)
7553 goto onError;
7554 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 else if (targetsize > 1) {
7556 /* 1-n mapping */
7557 if (targetsize > extrachars) {
7558 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 Py_ssize_t needed = (targetsize - extrachars) + \
7560 (targetsize << 2);
7561 extrachars += needed;
7562 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007563 if (unicode_resize(&v,
7564 PyUnicode_GET_LENGTH(v) + needed) < 0)
7565 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 Py_DECREF(x);
7567 goto onError;
7568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007569 }
Victor Stinner1b487b42012-05-03 12:29:04 +02007570 if (unicode_widen(&v, outpos, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007571 goto onError;
7572 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7573 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007574 extrachars -= targetsize;
7575 }
7576 /* 1-0 mapping: skip the character */
7577 }
7578 else {
7579 /* wrong return value */
7580 PyErr_SetString(PyExc_TypeError,
7581 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007582 Py_DECREF(x);
7583 goto onError;
7584 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 Py_DECREF(x);
7586 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007588 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007589 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007590 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007591 Py_XDECREF(errorHandler);
7592 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007593 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007594
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007596 Py_XDECREF(errorHandler);
7597 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007598 Py_XDECREF(v);
7599 return NULL;
7600}
7601
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007602/* Charmap encoding: the lookup table */
7603
Alexander Belopolsky40018472011-02-26 01:02:56 +00007604struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 PyObject_HEAD
7606 unsigned char level1[32];
7607 int count2, count3;
7608 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007609};
7610
7611static PyObject*
7612encoding_map_size(PyObject *obj, PyObject* args)
7613{
7614 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007615 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617}
7618
7619static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007621 PyDoc_STR("Return the size (in bytes) of this object") },
7622 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623};
7624
7625static void
7626encoding_map_dealloc(PyObject* o)
7627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007629}
7630
7631static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007632 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007633 "EncodingMap", /*tp_name*/
7634 sizeof(struct encoding_map), /*tp_basicsize*/
7635 0, /*tp_itemsize*/
7636 /* methods */
7637 encoding_map_dealloc, /*tp_dealloc*/
7638 0, /*tp_print*/
7639 0, /*tp_getattr*/
7640 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007641 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007642 0, /*tp_repr*/
7643 0, /*tp_as_number*/
7644 0, /*tp_as_sequence*/
7645 0, /*tp_as_mapping*/
7646 0, /*tp_hash*/
7647 0, /*tp_call*/
7648 0, /*tp_str*/
7649 0, /*tp_getattro*/
7650 0, /*tp_setattro*/
7651 0, /*tp_as_buffer*/
7652 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7653 0, /*tp_doc*/
7654 0, /*tp_traverse*/
7655 0, /*tp_clear*/
7656 0, /*tp_richcompare*/
7657 0, /*tp_weaklistoffset*/
7658 0, /*tp_iter*/
7659 0, /*tp_iternext*/
7660 encoding_map_methods, /*tp_methods*/
7661 0, /*tp_members*/
7662 0, /*tp_getset*/
7663 0, /*tp_base*/
7664 0, /*tp_dict*/
7665 0, /*tp_descr_get*/
7666 0, /*tp_descr_set*/
7667 0, /*tp_dictoffset*/
7668 0, /*tp_init*/
7669 0, /*tp_alloc*/
7670 0, /*tp_new*/
7671 0, /*tp_free*/
7672 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007673};
7674
7675PyObject*
7676PyUnicode_BuildEncodingMap(PyObject* string)
7677{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007678 PyObject *result;
7679 struct encoding_map *mresult;
7680 int i;
7681 int need_dict = 0;
7682 unsigned char level1[32];
7683 unsigned char level2[512];
7684 unsigned char *mlevel1, *mlevel2, *mlevel3;
7685 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007686 int kind;
7687 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007688 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007689 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007691 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007692 PyErr_BadArgument();
7693 return NULL;
7694 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007695 kind = PyUnicode_KIND(string);
7696 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007697 length = PyUnicode_GET_LENGTH(string);
7698 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007699 memset(level1, 0xFF, sizeof level1);
7700 memset(level2, 0xFF, sizeof level2);
7701
7702 /* If there isn't a one-to-one mapping of NULL to \0,
7703 or if there are non-BMP characters, we need to use
7704 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007705 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007707 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007708 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007709 ch = PyUnicode_READ(kind, data, i);
7710 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007711 need_dict = 1;
7712 break;
7713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007714 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007715 /* unmapped character */
7716 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717 l1 = ch >> 11;
7718 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007719 if (level1[l1] == 0xFF)
7720 level1[l1] = count2++;
7721 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007722 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007723 }
7724
7725 if (count2 >= 0xFF || count3 >= 0xFF)
7726 need_dict = 1;
7727
7728 if (need_dict) {
7729 PyObject *result = PyDict_New();
7730 PyObject *key, *value;
7731 if (!result)
7732 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007733 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007734 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007735 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007736 if (!key || !value)
7737 goto failed1;
7738 if (PyDict_SetItem(result, key, value) == -1)
7739 goto failed1;
7740 Py_DECREF(key);
7741 Py_DECREF(value);
7742 }
7743 return result;
7744 failed1:
7745 Py_XDECREF(key);
7746 Py_XDECREF(value);
7747 Py_DECREF(result);
7748 return NULL;
7749 }
7750
7751 /* Create a three-level trie */
7752 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7753 16*count2 + 128*count3 - 1);
7754 if (!result)
7755 return PyErr_NoMemory();
7756 PyObject_Init(result, &EncodingMapType);
7757 mresult = (struct encoding_map*)result;
7758 mresult->count2 = count2;
7759 mresult->count3 = count3;
7760 mlevel1 = mresult->level1;
7761 mlevel2 = mresult->level23;
7762 mlevel3 = mresult->level23 + 16*count2;
7763 memcpy(mlevel1, level1, 32);
7764 memset(mlevel2, 0xFF, 16*count2);
7765 memset(mlevel3, 0, 128*count3);
7766 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007767 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007768 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007769 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7770 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007771 /* unmapped character */
7772 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007773 o1 = ch>>11;
7774 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775 i2 = 16*mlevel1[o1] + o2;
7776 if (mlevel2[i2] == 0xFF)
7777 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007778 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007779 i3 = 128*mlevel2[i2] + o3;
7780 mlevel3[i3] = i;
7781 }
7782 return result;
7783}
7784
7785static int
Victor Stinner22168992011-11-20 17:09:18 +01007786encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007787{
7788 struct encoding_map *map = (struct encoding_map*)mapping;
7789 int l1 = c>>11;
7790 int l2 = (c>>7) & 0xF;
7791 int l3 = c & 0x7F;
7792 int i;
7793
Victor Stinner22168992011-11-20 17:09:18 +01007794 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007796 if (c == 0)
7797 return 0;
7798 /* level 1*/
7799 i = map->level1[l1];
7800 if (i == 0xFF) {
7801 return -1;
7802 }
7803 /* level 2*/
7804 i = map->level23[16*i+l2];
7805 if (i == 0xFF) {
7806 return -1;
7807 }
7808 /* level 3 */
7809 i = map->level23[16*map->count2 + 128*i + l3];
7810 if (i == 0) {
7811 return -1;
7812 }
7813 return i;
7814}
7815
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816/* Lookup the character ch in the mapping. If the character
7817 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007818 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007819static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007820charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007821{
Christian Heimes217cfd12007-12-02 14:31:20 +00007822 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007823 PyObject *x;
7824
7825 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007826 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 x = PyObject_GetItem(mapping, w);
7828 Py_DECREF(w);
7829 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7831 /* No mapping found means: mapping is undefined. */
7832 PyErr_Clear();
7833 x = Py_None;
7834 Py_INCREF(x);
7835 return x;
7836 } else
7837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007838 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007839 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007841 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 long value = PyLong_AS_LONG(x);
7843 if (value < 0 || value > 255) {
7844 PyErr_SetString(PyExc_TypeError,
7845 "character mapping must be in range(256)");
7846 Py_DECREF(x);
7847 return NULL;
7848 }
7849 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007850 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007851 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007854 /* wrong return value */
7855 PyErr_Format(PyExc_TypeError,
7856 "character mapping must return integer, bytes or None, not %.400s",
7857 x->ob_type->tp_name);
7858 Py_DECREF(x);
7859 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860 }
7861}
7862
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007864charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007866 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7867 /* exponentially overallocate to minimize reallocations */
7868 if (requiredsize < 2*outsize)
7869 requiredsize = 2*outsize;
7870 if (_PyBytes_Resize(outobj, requiredsize))
7871 return -1;
7872 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873}
7874
/* Outcome of an attempt to encode a single character through a
   character mapping. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007878/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007879 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007880 space is available. Return a new reference to the object that
7881 was put in the output buffer, or Py_None, if the mapping was undefined
7882 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007883 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007885charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007886 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007887{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007888 PyObject *rep;
7889 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007890 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007891
Christian Heimes90aa7642007-12-19 02:45:37 +00007892 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007895 if (res == -1)
7896 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 if (outsize<requiredsize)
7898 if (charmapencode_resize(outobj, outpos, requiredsize))
7899 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007900 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 outstart[(*outpos)++] = (char)res;
7902 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903 }
7904
7905 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007906 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007908 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 Py_DECREF(rep);
7910 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007912 if (PyLong_Check(rep)) {
7913 Py_ssize_t requiredsize = *outpos+1;
7914 if (outsize<requiredsize)
7915 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7916 Py_DECREF(rep);
7917 return enc_EXCEPTION;
7918 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007919 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007921 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 else {
7923 const char *repchars = PyBytes_AS_STRING(rep);
7924 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7925 Py_ssize_t requiredsize = *outpos+repsize;
7926 if (outsize<requiredsize)
7927 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7928 Py_DECREF(rep);
7929 return enc_EXCEPTION;
7930 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007931 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007932 memcpy(outstart + *outpos, repchars, repsize);
7933 *outpos += repsize;
7934 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007936 Py_DECREF(rep);
7937 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007938}
7939
7940/* handle an error in PyUnicode_EncodeCharmap
7941 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007942static int
7943charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007944 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007946 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007947 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007948{
7949 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007950 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007951 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007952 enum PyUnicode_Kind kind;
7953 void *data;
7954 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007955 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007956 Py_ssize_t collstartpos = *inpos;
7957 Py_ssize_t collendpos = *inpos+1;
7958 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959 char *encoding = "charmap";
7960 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007962 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007963 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007964
Benjamin Petersonbac79492012-01-14 13:34:47 -05007965 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007966 return -1;
7967 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007968 /* find all unencodable characters */
7969 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007971 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007972 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007973 val = encoding_map_lookup(ch, mapping);
7974 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 break;
7976 ++collendpos;
7977 continue;
7978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007979
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007980 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7981 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 if (rep==NULL)
7983 return -1;
7984 else if (rep!=Py_None) {
7985 Py_DECREF(rep);
7986 break;
7987 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007988 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990 }
7991 /* cache callback name lookup
7992 * (if not done yet, i.e. it's the first error) */
7993 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007994 if ((errors==NULL) || (!strcmp(errors, "strict")))
7995 *known_errorHandler = 1;
7996 else if (!strcmp(errors, "replace"))
7997 *known_errorHandler = 2;
7998 else if (!strcmp(errors, "ignore"))
7999 *known_errorHandler = 3;
8000 else if (!strcmp(errors, "xmlcharrefreplace"))
8001 *known_errorHandler = 4;
8002 else
8003 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008004 }
8005 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008007 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008008 return -1;
8009 case 2: /* replace */
8010 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 x = charmapencode_output('?', mapping, res, respos);
8012 if (x==enc_EXCEPTION) {
8013 return -1;
8014 }
8015 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008016 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008017 return -1;
8018 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008019 }
8020 /* fall through */
8021 case 3: /* ignore */
8022 *inpos = collendpos;
8023 break;
8024 case 4: /* xmlcharrefreplace */
8025 /* generate replacement (temporarily (mis)uses p) */
8026 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 char buffer[2+29+1+1];
8028 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008029 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 for (cp = buffer; *cp; ++cp) {
8031 x = charmapencode_output(*cp, mapping, res, respos);
8032 if (x==enc_EXCEPTION)
8033 return -1;
8034 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008035 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 return -1;
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 }
8039 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 *inpos = collendpos;
8041 break;
8042 default:
8043 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008044 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008046 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008048 if (PyBytes_Check(repunicode)) {
8049 /* Directly copy bytes result to output. */
8050 Py_ssize_t outsize = PyBytes_Size(*res);
8051 Py_ssize_t requiredsize;
8052 repsize = PyBytes_Size(repunicode);
8053 requiredsize = *respos + repsize;
8054 if (requiredsize > outsize)
8055 /* Make room for all additional bytes. */
8056 if (charmapencode_resize(res, respos, requiredsize)) {
8057 Py_DECREF(repunicode);
8058 return -1;
8059 }
8060 memcpy(PyBytes_AsString(*res) + *respos,
8061 PyBytes_AsString(repunicode), repsize);
8062 *respos += repsize;
8063 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008064 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008065 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008066 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008068 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008069 Py_DECREF(repunicode);
8070 return -1;
8071 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008072 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008073 data = PyUnicode_DATA(repunicode);
8074 kind = PyUnicode_KIND(repunicode);
8075 for (index = 0; index < repsize; index++) {
8076 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8077 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008079 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 return -1;
8081 }
8082 else if (x==enc_FAILED) {
8083 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008084 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 return -1;
8086 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008087 }
8088 *inpos = newpos;
8089 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 }
8091 return 0;
8092}
8093
Alexander Belopolsky40018472011-02-26 01:02:56 +00008094PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008095_PyUnicode_EncodeCharmap(PyObject *unicode,
8096 PyObject *mapping,
8097 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008098{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 /* output object */
8100 PyObject *res = NULL;
8101 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008102 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008105 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106 PyObject *errorHandler = NULL;
8107 PyObject *exc = NULL;
8108 /* the following variable is used for caching string comparisons
8109 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8110 * 3=ignore, 4=xmlcharrefreplace */
8111 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008112
Benjamin Petersonbac79492012-01-14 13:34:47 -05008113 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008114 return NULL;
8115 size = PyUnicode_GET_LENGTH(unicode);
8116
Guido van Rossumd57fd912000-03-10 22:53:23 +00008117 /* Default to Latin-1 */
8118 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008119 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008120
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 /* allocate enough for a simple encoding without
8122 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008123 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008124 if (res == NULL)
8125 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008126 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008132 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 if (x==enc_EXCEPTION) /* error */
8134 goto onError;
8135 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 &exc,
8138 &known_errorHandler, &errorHandler, errors,
8139 &res, &respos)) {
8140 goto onError;
8141 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008143 else
8144 /* done with this character => adjust input position */
8145 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008149 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008150 if (_PyBytes_Resize(&res, respos) < 0)
8151 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008153 Py_XDECREF(exc);
8154 Py_XDECREF(errorHandler);
8155 return res;
8156
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 Py_XDECREF(res);
8159 Py_XDECREF(exc);
8160 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008161 return NULL;
8162}
8163
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008164/* Deprecated */
8165PyObject *
8166PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8167 Py_ssize_t size,
8168 PyObject *mapping,
8169 const char *errors)
8170{
8171 PyObject *result;
8172 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8173 if (unicode == NULL)
8174 return NULL;
8175 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8176 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008177 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008178}
8179
Alexander Belopolsky40018472011-02-26 01:02:56 +00008180PyObject *
8181PyUnicode_AsCharmapString(PyObject *unicode,
8182 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183{
8184 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 PyErr_BadArgument();
8186 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008188 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008189}
8190
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008191/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008192static void
8193make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008195 Py_ssize_t startpos, Py_ssize_t endpos,
8196 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 *exceptionObject = _PyUnicodeTranslateError_Create(
8200 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008201 }
8202 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8206 goto onError;
8207 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8208 goto onError;
8209 return;
8210 onError:
8211 Py_DECREF(*exceptionObject);
8212 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008213 }
8214}
8215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008217static void
8218raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008220 Py_ssize_t startpos, Py_ssize_t endpos,
8221 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008222{
8223 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008224 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008226 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008227}
8228
8229/* error handling callback helper:
8230 build arguments, call the callback and check the arguments,
8231 put the result into newpos and return the replacement string, which
8232 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008233static PyObject *
8234unicode_translate_call_errorhandler(const char *errors,
8235 PyObject **errorHandler,
8236 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008238 Py_ssize_t startpos, Py_ssize_t endpos,
8239 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008241 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008243 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008244 PyObject *restuple;
8245 PyObject *resunicode;
8246
8247 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251 }
8252
8253 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008254 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008257
8258 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008263 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 Py_DECREF(restuple);
8265 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008266 }
8267 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 &resunicode, &i_newpos)) {
8269 Py_DECREF(restuple);
8270 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008271 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008273 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008274 else
8275 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8278 Py_DECREF(restuple);
8279 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008280 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008281 Py_INCREF(resunicode);
8282 Py_DECREF(restuple);
8283 return resunicode;
8284}
8285
8286/* Lookup the character ch in the mapping and put the result in result,
8287 which must be decrefed by the caller.
8288 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008289static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291{
Christian Heimes217cfd12007-12-02 14:31:20 +00008292 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293 PyObject *x;
8294
8295 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 x = PyObject_GetItem(mapping, w);
8298 Py_DECREF(w);
8299 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008300 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8301 /* No mapping found means: use 1:1 mapping. */
8302 PyErr_Clear();
8303 *result = NULL;
8304 return 0;
8305 } else
8306 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 }
8308 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 *result = x;
8310 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008311 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008312 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 long value = PyLong_AS_LONG(x);
8314 long max = PyUnicode_GetMax();
8315 if (value < 0 || value > max) {
8316 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008317 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008318 Py_DECREF(x);
8319 return -1;
8320 }
8321 *result = x;
8322 return 0;
8323 }
8324 else if (PyUnicode_Check(x)) {
8325 *result = x;
8326 return 0;
8327 }
8328 else {
8329 /* wrong return value */
8330 PyErr_SetString(PyExc_TypeError,
8331 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008332 Py_DECREF(x);
8333 return -1;
8334 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335}
8336/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 if not reallocate and adjust various state variables.
8338 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008339static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008340charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008341 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008342{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008343 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008344 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008345 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008346 /* exponentially overallocate to minimize reallocations */
8347 if (requiredsize < 2 * oldsize)
8348 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008349 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8350 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008351 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008352 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008353 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354 }
8355 return 0;
8356}
8357/* lookup the character, put the result in the output string and adjust
8358 various state variables. Return a new reference to the object that
8359 was put in the output buffer in *result, or Py_None, if the mapping was
8360 undefined (in which case no character was written).
8361 The called must decref result.
8362 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8365 PyObject *mapping, Py_UCS4 **output,
8366 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008369 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8370 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 }
8376 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008378 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008379 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008380 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 }
8382 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 Py_ssize_t repsize;
8384 if (PyUnicode_READY(*res) == -1)
8385 return -1;
8386 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 if (repsize==1) {
8388 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008389 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 }
8391 else if (repsize!=0) {
8392 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 Py_ssize_t requiredsize = *opos +
8394 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008396 Py_ssize_t i;
8397 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 for(i = 0; i < repsize; i++)
8400 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 }
8403 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405 return 0;
8406}
8407
Alexander Belopolsky40018472011-02-26 01:02:56 +00008408PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008409_PyUnicode_TranslateCharmap(PyObject *input,
8410 PyObject *mapping,
8411 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008413 /* input object */
8414 char *idata;
8415 Py_ssize_t size, i;
8416 int kind;
8417 /* output buffer */
8418 Py_UCS4 *output = NULL;
8419 Py_ssize_t osize;
8420 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008422 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 char *reason = "character maps to <undefined>";
8424 PyObject *errorHandler = NULL;
8425 PyObject *exc = NULL;
8426 /* the following variable is used for caching string comparisons
8427 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8428 * 3=ignore, 4=xmlcharrefreplace */
8429 int known_errorHandler = -1;
8430
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 PyErr_BadArgument();
8433 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008434 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 if (PyUnicode_READY(input) == -1)
8437 return NULL;
8438 idata = (char*)PyUnicode_DATA(input);
8439 kind = PyUnicode_KIND(input);
8440 size = PyUnicode_GET_LENGTH(input);
8441 i = 0;
8442
8443 if (size == 0) {
8444 Py_INCREF(input);
8445 return input;
8446 }
8447
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 /* allocate enough for a simple 1:1 translation without
8449 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 osize = size;
8451 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8452 opos = 0;
8453 if (output == NULL) {
8454 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 /* try to encode it */
8460 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008461 if (charmaptranslate_output(input, i, mapping,
8462 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 Py_XDECREF(x);
8464 goto onError;
8465 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008466 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 else { /* untranslatable character */
8470 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8471 Py_ssize_t repsize;
8472 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008475 Py_ssize_t collstart = i;
8476 Py_ssize_t collend = i+1;
8477 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008478
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008480 while (collend < size) {
8481 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 goto onError;
8483 Py_XDECREF(x);
8484 if (x!=Py_None)
8485 break;
8486 ++collend;
8487 }
8488 /* cache callback name lookup
8489 * (if not done yet, i.e. it's the first error) */
8490 if (known_errorHandler==-1) {
8491 if ((errors==NULL) || (!strcmp(errors, "strict")))
8492 known_errorHandler = 1;
8493 else if (!strcmp(errors, "replace"))
8494 known_errorHandler = 2;
8495 else if (!strcmp(errors, "ignore"))
8496 known_errorHandler = 3;
8497 else if (!strcmp(errors, "xmlcharrefreplace"))
8498 known_errorHandler = 4;
8499 else
8500 known_errorHandler = 0;
8501 }
8502 switch (known_errorHandler) {
8503 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 raise_translate_exception(&exc, input, collstart,
8505 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 case 2: /* replace */
8508 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 for (coll = collstart; coll<collend; coll++)
8510 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 /* fall through */
8512 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008513 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 break;
8515 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 /* generate replacement (temporarily (mis)uses i) */
8517 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 char buffer[2+29+1+1];
8519 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008520 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8521 if (charmaptranslate_makespace(&output, &osize,
8522 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 goto onError;
8524 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 break;
8529 default:
8530 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 reason, input, &exc,
8532 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008533 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008535 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008536 Py_DECREF(repunicode);
8537 goto onError;
8538 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 repsize = PyUnicode_GET_LENGTH(repunicode);
8541 if (charmaptranslate_makespace(&output, &osize,
8542 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 Py_DECREF(repunicode);
8544 goto onError;
8545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 for (uni2 = 0; repsize-->0; ++uni2)
8547 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8548 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008550 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008551 }
8552 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008553 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8554 if (!res)
8555 goto onError;
8556 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 Py_XDECREF(exc);
8558 Py_XDECREF(errorHandler);
8559 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008560
Benjamin Peterson29060642009-01-31 22:14:21 +00008561 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008563 Py_XDECREF(exc);
8564 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565 return NULL;
8566}
8567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568/* Deprecated. Use PyUnicode_Translate instead. */
8569PyObject *
8570PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8571 Py_ssize_t size,
8572 PyObject *mapping,
8573 const char *errors)
8574{
Christian Heimes5f520f42012-09-11 14:03:25 +02008575 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8577 if (!unicode)
8578 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008579 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8580 Py_DECREF(unicode);
8581 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008582}
8583
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584PyObject *
8585PyUnicode_Translate(PyObject *str,
8586 PyObject *mapping,
8587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
8589 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 str = PyUnicode_FromObject(str);
8592 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008593 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008594 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 Py_DECREF(str);
8596 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008597}
Tim Petersced69f82003-09-16 20:30:58 +00008598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008600fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601{
8602 /* No need to call PyUnicode_READY(self) because this function is only
8603 called as a callback from fixup() which does it already. */
8604 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8605 const int kind = PyUnicode_KIND(self);
8606 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008607 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008608 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008609 Py_ssize_t i;
8610
8611 for (i = 0; i < len; ++i) {
8612 ch = PyUnicode_READ(kind, data, i);
8613 fixed = 0;
8614 if (ch > 127) {
8615 if (Py_UNICODE_ISSPACE(ch))
8616 fixed = ' ';
8617 else {
8618 const int decimal = Py_UNICODE_TODECIMAL(ch);
8619 if (decimal >= 0)
8620 fixed = '0' + decimal;
8621 }
8622 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008623 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008624 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 PyUnicode_WRITE(kind, data, i, fixed);
8626 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008627 else
8628 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 }
8631
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008632 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633}
8634
8635PyObject *
8636_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8637{
8638 if (!PyUnicode_Check(unicode)) {
8639 PyErr_BadInternalCall();
8640 return NULL;
8641 }
8642 if (PyUnicode_READY(unicode) == -1)
8643 return NULL;
8644 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8645 /* If the string is already ASCII, just return the same string */
8646 Py_INCREF(unicode);
8647 return unicode;
8648 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008649 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650}
8651
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008652PyObject *
8653PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8654 Py_ssize_t length)
8655{
Victor Stinnerf0124502011-11-21 23:12:56 +01008656 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008657 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008658 Py_UCS4 maxchar;
8659 enum PyUnicode_Kind kind;
8660 void *data;
8661
Victor Stinner99d7ad02012-02-22 13:37:39 +01008662 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008663 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008664 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008665 if (ch > 127) {
8666 int decimal = Py_UNICODE_TODECIMAL(ch);
8667 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008668 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008669 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008670 }
8671 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008672
8673 /* Copy to a new string */
8674 decimal = PyUnicode_New(length, maxchar);
8675 if (decimal == NULL)
8676 return decimal;
8677 kind = PyUnicode_KIND(decimal);
8678 data = PyUnicode_DATA(decimal);
8679 /* Iterate over code points */
8680 for (i = 0; i < length; i++) {
8681 Py_UNICODE ch = s[i];
8682 if (ch > 127) {
8683 int decimal = Py_UNICODE_TODECIMAL(ch);
8684 if (decimal >= 0)
8685 ch = '0' + decimal;
8686 }
8687 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008688 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008689 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008690}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008691/* --- Decimal Encoder ---------------------------------------------------- */
8692
Alexander Belopolsky40018472011-02-26 01:02:56 +00008693int
8694PyUnicode_EncodeDecimal(Py_UNICODE *s,
8695 Py_ssize_t length,
8696 char *output,
8697 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008698{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008699 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008700 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008701 enum PyUnicode_Kind kind;
8702 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008703
8704 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 PyErr_BadArgument();
8706 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008707 }
8708
Victor Stinner42bf7752011-11-21 22:52:58 +01008709 unicode = PyUnicode_FromUnicode(s, length);
8710 if (unicode == NULL)
8711 return -1;
8712
Benjamin Petersonbac79492012-01-14 13:34:47 -05008713 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008714 Py_DECREF(unicode);
8715 return -1;
8716 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008717 kind = PyUnicode_KIND(unicode);
8718 data = PyUnicode_DATA(unicode);
8719
Victor Stinnerb84d7232011-11-22 01:50:07 +01008720 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008721 PyObject *exc;
8722 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008724 Py_ssize_t startpos;
8725
8726 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008727
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008729 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008730 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008732 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 decimal = Py_UNICODE_TODECIMAL(ch);
8734 if (decimal >= 0) {
8735 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008736 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 continue;
8738 }
8739 if (0 < ch && ch < 256) {
8740 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008741 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 continue;
8743 }
Victor Stinner6345be92011-11-25 20:09:01 +01008744
Victor Stinner42bf7752011-11-21 22:52:58 +01008745 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008746 exc = NULL;
8747 raise_encode_exception(&exc, "decimal", unicode,
8748 startpos, startpos+1,
8749 "invalid decimal Unicode string");
8750 Py_XDECREF(exc);
8751 Py_DECREF(unicode);
8752 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008753 }
8754 /* 0-terminate the output string */
8755 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008756 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008757 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008758}
8759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760/* --- Helpers ------------------------------------------------------------ */
8761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008763any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 Py_ssize_t start,
8765 Py_ssize_t end)
8766{
8767 int kind1, kind2, kind;
8768 void *buf1, *buf2;
8769 Py_ssize_t len1, len2, result;
8770
8771 kind1 = PyUnicode_KIND(s1);
8772 kind2 = PyUnicode_KIND(s2);
8773 kind = kind1 > kind2 ? kind1 : kind2;
8774 buf1 = PyUnicode_DATA(s1);
8775 buf2 = PyUnicode_DATA(s2);
8776 if (kind1 != kind)
8777 buf1 = _PyUnicode_AsKind(s1, kind);
8778 if (!buf1)
8779 return -2;
8780 if (kind2 != kind)
8781 buf2 = _PyUnicode_AsKind(s2, kind);
8782 if (!buf2) {
8783 if (kind1 != kind) PyMem_Free(buf1);
8784 return -2;
8785 }
8786 len1 = PyUnicode_GET_LENGTH(s1);
8787 len2 = PyUnicode_GET_LENGTH(s2);
8788
Victor Stinner794d5672011-10-10 03:21:36 +02008789 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008790 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008791 case PyUnicode_1BYTE_KIND:
8792 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8793 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8794 else
8795 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8796 break;
8797 case PyUnicode_2BYTE_KIND:
8798 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8799 break;
8800 case PyUnicode_4BYTE_KIND:
8801 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8802 break;
8803 default:
8804 assert(0); result = -2;
8805 }
8806 }
8807 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008808 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008809 case PyUnicode_1BYTE_KIND:
8810 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8811 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8812 else
8813 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8814 break;
8815 case PyUnicode_2BYTE_KIND:
8816 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8817 break;
8818 case PyUnicode_4BYTE_KIND:
8819 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8820 break;
8821 default:
8822 assert(0); result = -2;
8823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 }
8825
8826 if (kind1 != kind)
8827 PyMem_Free(buf1);
8828 if (kind2 != kind)
8829 PyMem_Free(buf2);
8830
8831 return result;
8832}
8833
8834Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008835_PyUnicode_InsertThousandsGrouping(
8836 PyObject *unicode, Py_ssize_t index,
8837 Py_ssize_t n_buffer,
8838 void *digits, Py_ssize_t n_digits,
8839 Py_ssize_t min_width,
8840 const char *grouping, PyObject *thousands_sep,
8841 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842{
Victor Stinner41a863c2012-02-24 00:37:51 +01008843 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008844 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008845 Py_ssize_t thousands_sep_len;
8846 Py_ssize_t len;
8847
8848 if (unicode != NULL) {
8849 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008850 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008851 }
8852 else {
8853 kind = PyUnicode_1BYTE_KIND;
8854 data = NULL;
8855 }
8856 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8857 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8858 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8859 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008860 if (thousands_sep_kind < kind) {
8861 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8862 if (!thousands_sep_data)
8863 return -1;
8864 }
8865 else {
8866 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8867 if (!data)
8868 return -1;
8869 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008870 }
8871
Benjamin Petersonead6b532011-12-20 17:23:42 -06008872 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008873 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008874 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008877 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008878 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008879 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008880 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008881 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008882 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008883 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008884 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008886 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008887 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008889 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008891 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008892 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008893 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008894 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008895 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008896 break;
8897 default:
8898 assert(0);
8899 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008901 if (unicode != NULL && thousands_sep_kind != kind) {
8902 if (thousands_sep_kind < kind)
8903 PyMem_Free(thousands_sep_data);
8904 else
8905 PyMem_Free(data);
8906 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008907 if (unicode == NULL) {
8908 *maxchar = 127;
8909 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008910 *maxchar = MAX_MAXCHAR(*maxchar,
8911 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008912 }
8913 }
8914 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915}
8916
8917
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues.  All three arguments are evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement: use it as `ADJUST_INDICES(start, end, len);`, which is
   then also safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008932
Alexander Belopolsky40018472011-02-26 01:02:56 +00008933Py_ssize_t
8934PyUnicode_Count(PyObject *str,
8935 PyObject *substr,
8936 Py_ssize_t start,
8937 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008938{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008939 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008940 PyObject* str_obj;
8941 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 int kind1, kind2, kind;
8943 void *buf1 = NULL, *buf2 = NULL;
8944 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008945
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008946 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008947 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008949 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008950 if (!sub_obj) {
8951 Py_DECREF(str_obj);
8952 return -1;
8953 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008954 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008955 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 Py_DECREF(str_obj);
8957 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958 }
Tim Petersced69f82003-09-16 20:30:58 +00008959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 kind1 = PyUnicode_KIND(str_obj);
8961 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008962 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008965 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008966 if (kind2 > kind) {
8967 Py_DECREF(sub_obj);
8968 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008969 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008970 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008971 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 if (!buf2)
8974 goto onError;
8975 len1 = PyUnicode_GET_LENGTH(str_obj);
8976 len2 = PyUnicode_GET_LENGTH(sub_obj);
8977
8978 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008979 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008981 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8982 result = asciilib_count(
8983 ((Py_UCS1*)buf1) + start, end - start,
8984 buf2, len2, PY_SSIZE_T_MAX
8985 );
8986 else
8987 result = ucs1lib_count(
8988 ((Py_UCS1*)buf1) + start, end - start,
8989 buf2, len2, PY_SSIZE_T_MAX
8990 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 break;
8992 case PyUnicode_2BYTE_KIND:
8993 result = ucs2lib_count(
8994 ((Py_UCS2*)buf1) + start, end - start,
8995 buf2, len2, PY_SSIZE_T_MAX
8996 );
8997 break;
8998 case PyUnicode_4BYTE_KIND:
8999 result = ucs4lib_count(
9000 ((Py_UCS4*)buf1) + start, end - start,
9001 buf2, len2, PY_SSIZE_T_MAX
9002 );
9003 break;
9004 default:
9005 assert(0); result = 0;
9006 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009007
9008 Py_DECREF(sub_obj);
9009 Py_DECREF(str_obj);
9010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 if (kind2 != kind)
9012 PyMem_Free(buf2);
9013
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 onError:
9016 Py_DECREF(sub_obj);
9017 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 if (kind2 != kind && buf2)
9019 PyMem_Free(buf2);
9020 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021}
9022
Alexander Belopolsky40018472011-02-26 01:02:56 +00009023Py_ssize_t
9024PyUnicode_Find(PyObject *str,
9025 PyObject *sub,
9026 Py_ssize_t start,
9027 Py_ssize_t end,
9028 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009030 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009031
Guido van Rossumd57fd912000-03-10 22:53:23 +00009032 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009033 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009034 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009035 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009036 if (!sub) {
9037 Py_DECREF(str);
9038 return -2;
9039 }
9040 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9041 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 Py_DECREF(str);
9043 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 }
Tim Petersced69f82003-09-16 20:30:58 +00009045
Victor Stinner794d5672011-10-10 03:21:36 +02009046 result = any_find_slice(direction,
9047 str, sub, start, end
9048 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009051 Py_DECREF(sub);
9052
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 return result;
9054}
9055
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056Py_ssize_t
9057PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9058 Py_ssize_t start, Py_ssize_t end,
9059 int direction)
9060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009062 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 if (PyUnicode_READY(str) == -1)
9064 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009065 if (start < 0 || end < 0) {
9066 PyErr_SetString(PyExc_IndexError, "string index out of range");
9067 return -2;
9068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 if (end > PyUnicode_GET_LENGTH(str))
9070 end = PyUnicode_GET_LENGTH(str);
9071 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009072 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9073 kind, end-start, ch, direction);
9074 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009076 else
9077 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078}
9079
Alexander Belopolsky40018472011-02-26 01:02:56 +00009080static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009081tailmatch(PyObject *self,
9082 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009083 Py_ssize_t start,
9084 Py_ssize_t end,
9085 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 int kind_self;
9088 int kind_sub;
9089 void *data_self;
9090 void *data_sub;
9091 Py_ssize_t offset;
9092 Py_ssize_t i;
9093 Py_ssize_t end_sub;
9094
9095 if (PyUnicode_READY(self) == -1 ||
9096 PyUnicode_READY(substring) == -1)
9097 return 0;
9098
9099 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100 return 1;
9101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9103 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009107 kind_self = PyUnicode_KIND(self);
9108 data_self = PyUnicode_DATA(self);
9109 kind_sub = PyUnicode_KIND(substring);
9110 data_sub = PyUnicode_DATA(substring);
9111 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9112
9113 if (direction > 0)
9114 offset = end;
9115 else
9116 offset = start;
9117
9118 if (PyUnicode_READ(kind_self, data_self, offset) ==
9119 PyUnicode_READ(kind_sub, data_sub, 0) &&
9120 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9121 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9122 /* If both are of the same kind, memcmp is sufficient */
9123 if (kind_self == kind_sub) {
9124 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009125 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 data_sub,
9127 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009128 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 }
9130 /* otherwise we have to compare each character by first accesing it */
9131 else {
9132 /* We do not need to compare 0 and len(substring)-1 because
9133 the if statement above ensured already that they are equal
9134 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009135 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 for (i = 1; i < end_sub; ++i) {
9137 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9138 PyUnicode_READ(kind_sub, data_sub, i))
9139 return 0;
9140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009141 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009143 }
9144
9145 return 0;
9146}
9147
Alexander Belopolsky40018472011-02-26 01:02:56 +00009148Py_ssize_t
9149PyUnicode_Tailmatch(PyObject *str,
9150 PyObject *substr,
9151 Py_ssize_t start,
9152 Py_ssize_t end,
9153 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009155 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009156
Guido van Rossumd57fd912000-03-10 22:53:23 +00009157 str = PyUnicode_FromObject(str);
9158 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160 substr = PyUnicode_FromObject(substr);
9161 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009162 Py_DECREF(str);
9163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009164 }
Tim Petersced69f82003-09-16 20:30:58 +00009165
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009166 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009168 Py_DECREF(str);
9169 Py_DECREF(substr);
9170 return result;
9171}
9172
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173/* Apply fixfct filter to the Unicode object self and return a
9174 reference to the modified object */
9175
Alexander Belopolsky40018472011-02-26 01:02:56 +00009176static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009177fixup(PyObject *self,
9178 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 PyObject *u;
9181 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009182 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009184 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009185 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009186 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009187 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009189 /* fix functions return the new maximum character in a string,
9190 if the kind of the resulting unicode object does not change,
9191 everything is fine. Otherwise we need to change the string kind
9192 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009193 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009194
9195 if (maxchar_new == 0) {
9196 /* no changes */;
9197 if (PyUnicode_CheckExact(self)) {
9198 Py_DECREF(u);
9199 Py_INCREF(self);
9200 return self;
9201 }
9202 else
9203 return u;
9204 }
9205
Victor Stinnere6abb482012-05-02 01:15:40 +02009206 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207
Victor Stinnereaab6042011-12-11 22:22:39 +01009208 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009209 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009210
9211 /* In case the maximum character changed, we need to
9212 convert the string to the new category. */
9213 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9214 if (v == NULL) {
9215 Py_DECREF(u);
9216 return NULL;
9217 }
9218 if (maxchar_new > maxchar_old) {
9219 /* If the maxchar increased so that the kind changed, not all
9220 characters are representable anymore and we need to fix the
9221 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009222 _PyUnicode_FastCopyCharacters(v, 0,
9223 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009224 maxchar_old = fixfct(v);
9225 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 }
9227 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009228 _PyUnicode_FastCopyCharacters(v, 0,
9229 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009231 Py_DECREF(u);
9232 assert(_PyUnicode_CheckConsistency(v, 1));
9233 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234}
9235
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009236static PyObject *
9237ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009239 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9240 char *resdata, *data = PyUnicode_DATA(self);
9241 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009242
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009243 res = PyUnicode_New(len, 127);
9244 if (res == NULL)
9245 return NULL;
9246 resdata = PyUnicode_DATA(res);
9247 if (lower)
9248 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009250 _Py_bytes_upper(resdata, data, len);
9251 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009252}
9253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009257 Py_ssize_t j;
9258 int final_sigma;
9259 Py_UCS4 c;
9260 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009261
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009262 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9263
9264 where ! is a negation and \p{xxx} is a character with property xxx.
9265 */
9266 for (j = i - 1; j >= 0; j--) {
9267 c = PyUnicode_READ(kind, data, j);
9268 if (!_PyUnicode_IsCaseIgnorable(c))
9269 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009271 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9272 if (final_sigma) {
9273 for (j = i + 1; j < length; j++) {
9274 c = PyUnicode_READ(kind, data, j);
9275 if (!_PyUnicode_IsCaseIgnorable(c))
9276 break;
9277 }
9278 final_sigma = j == length || !_PyUnicode_IsCased(c);
9279 }
9280 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281}
9282
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009283static int
9284lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9285 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009286{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009287 /* Obscure special case. */
9288 if (c == 0x3A3) {
9289 mapped[0] = handle_capital_sigma(kind, data, length, i);
9290 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009292 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293}
9294
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009295static Py_ssize_t
9296do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009298 Py_ssize_t i, k = 0;
9299 int n_res, j;
9300 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009301
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009302 c = PyUnicode_READ(kind, data, 0);
9303 n_res = _PyUnicode_ToUpperFull(c, mapped);
9304 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009305 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009307 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009308 for (i = 1; i < length; i++) {
9309 c = PyUnicode_READ(kind, data, i);
9310 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9311 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009312 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009313 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009314 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009315 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009316 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317}
9318
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009319static Py_ssize_t
9320do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9321 Py_ssize_t i, k = 0;
9322
9323 for (i = 0; i < length; i++) {
9324 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9325 int n_res, j;
9326 if (Py_UNICODE_ISUPPER(c)) {
9327 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9328 }
9329 else if (Py_UNICODE_ISLOWER(c)) {
9330 n_res = _PyUnicode_ToUpperFull(c, mapped);
9331 }
9332 else {
9333 n_res = 1;
9334 mapped[0] = c;
9335 }
9336 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009337 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009338 res[k++] = mapped[j];
9339 }
9340 }
9341 return k;
9342}
9343
9344static Py_ssize_t
9345do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9346 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009347{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009348 Py_ssize_t i, k = 0;
9349
9350 for (i = 0; i < length; i++) {
9351 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9352 int n_res, j;
9353 if (lower)
9354 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9355 else
9356 n_res = _PyUnicode_ToUpperFull(c, mapped);
9357 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009358 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009359 res[k++] = mapped[j];
9360 }
9361 }
9362 return k;
9363}
9364
9365static Py_ssize_t
9366do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9367{
9368 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9369}
9370
9371static Py_ssize_t
9372do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9373{
9374 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9375}
9376
Benjamin Petersone51757f2012-01-12 21:10:29 -05009377static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009378do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9379{
9380 Py_ssize_t i, k = 0;
9381
9382 for (i = 0; i < length; i++) {
9383 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9384 Py_UCS4 mapped[3];
9385 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9386 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009387 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009388 res[k++] = mapped[j];
9389 }
9390 }
9391 return k;
9392}
9393
9394static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009395do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9396{
9397 Py_ssize_t i, k = 0;
9398 int previous_is_cased;
9399
9400 previous_is_cased = 0;
9401 for (i = 0; i < length; i++) {
9402 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9403 Py_UCS4 mapped[3];
9404 int n_res, j;
9405
9406 if (previous_is_cased)
9407 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9408 else
9409 n_res = _PyUnicode_ToTitleFull(c, mapped);
9410
9411 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009412 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009413 res[k++] = mapped[j];
9414 }
9415
9416 previous_is_cased = _PyUnicode_IsCased(c);
9417 }
9418 return k;
9419}
9420
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009421static PyObject *
9422case_operation(PyObject *self,
9423 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9424{
9425 PyObject *res = NULL;
9426 Py_ssize_t length, newlength = 0;
9427 int kind, outkind;
9428 void *data, *outdata;
9429 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9430
Benjamin Petersoneea48462012-01-16 14:28:50 -05009431 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009432
9433 kind = PyUnicode_KIND(self);
9434 data = PyUnicode_DATA(self);
9435 length = PyUnicode_GET_LENGTH(self);
9436 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9437 if (tmp == NULL)
9438 return PyErr_NoMemory();
9439 newlength = perform(kind, data, length, tmp, &maxchar);
9440 res = PyUnicode_New(newlength, maxchar);
9441 if (res == NULL)
9442 goto leave;
9443 tmpend = tmp + newlength;
9444 outdata = PyUnicode_DATA(res);
9445 outkind = PyUnicode_KIND(res);
9446 switch (outkind) {
9447 case PyUnicode_1BYTE_KIND:
9448 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9449 break;
9450 case PyUnicode_2BYTE_KIND:
9451 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9452 break;
9453 case PyUnicode_4BYTE_KIND:
9454 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9455 break;
9456 default:
9457 assert(0);
9458 break;
9459 }
9460 leave:
9461 PyMem_FREE(tmp);
9462 return res;
9463}
9464
Tim Peters8ce9f162004-08-27 01:49:32 +00009465PyObject *
9466PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009469 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009470 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009471 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009472 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9473 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009474 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009476 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009477 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009478 int use_memcpy;
9479 unsigned char *res_data = NULL, *sep_data = NULL;
9480 PyObject *last_obj;
9481 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482
Tim Peters05eba1f2004-08-27 21:32:02 +00009483 fseq = PySequence_Fast(seq, "");
9484 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009485 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009486 }
9487
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009488 /* NOTE: the following code can't call back into Python code,
9489 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009490 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009491
Tim Peters05eba1f2004-08-27 21:32:02 +00009492 seqlen = PySequence_Fast_GET_SIZE(fseq);
9493 /* If empty sequence, return u"". */
9494 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009495 Py_DECREF(fseq);
9496 Py_INCREF(unicode_empty);
9497 res = unicode_empty;
9498 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009499 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009500
Tim Peters05eba1f2004-08-27 21:32:02 +00009501 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009502 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009503 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009504 if (seqlen == 1) {
9505 if (PyUnicode_CheckExact(items[0])) {
9506 res = items[0];
9507 Py_INCREF(res);
9508 Py_DECREF(fseq);
9509 return res;
9510 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009511 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009512 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009513 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009514 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009515 /* Set up sep and seplen */
9516 if (separator == NULL) {
9517 /* fall back to a blank space separator */
9518 sep = PyUnicode_FromOrdinal(' ');
9519 if (!sep)
9520 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009521 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009522 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009523 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009524 else {
9525 if (!PyUnicode_Check(separator)) {
9526 PyErr_Format(PyExc_TypeError,
9527 "separator: expected str instance,"
9528 " %.80s found",
9529 Py_TYPE(separator)->tp_name);
9530 goto onError;
9531 }
9532 if (PyUnicode_READY(separator))
9533 goto onError;
9534 sep = separator;
9535 seplen = PyUnicode_GET_LENGTH(separator);
9536 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9537 /* inc refcount to keep this code path symmetric with the
9538 above case of a blank separator */
9539 Py_INCREF(sep);
9540 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009541 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009542 }
9543
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009544 /* There are at least two things to join, or else we have a subclass
9545 * of str in the sequence.
9546 * Do a pre-pass to figure out the total amount of space we'll
9547 * need (sz), and see whether all argument are strings.
9548 */
9549 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009550#ifdef Py_DEBUG
9551 use_memcpy = 0;
9552#else
9553 use_memcpy = 1;
9554#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009555 for (i = 0; i < seqlen; i++) {
9556 const Py_ssize_t old_sz = sz;
9557 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 if (!PyUnicode_Check(item)) {
9559 PyErr_Format(PyExc_TypeError,
9560 "sequence item %zd: expected str instance,"
9561 " %.80s found",
9562 i, Py_TYPE(item)->tp_name);
9563 goto onError;
9564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 if (PyUnicode_READY(item) == -1)
9566 goto onError;
9567 sz += PyUnicode_GET_LENGTH(item);
9568 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009569 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009570 if (i != 0)
9571 sz += seplen;
9572 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9573 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009574 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009575 goto onError;
9576 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009577 if (use_memcpy && last_obj != NULL) {
9578 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9579 use_memcpy = 0;
9580 }
9581 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009582 }
Tim Petersced69f82003-09-16 20:30:58 +00009583
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009585 if (res == NULL)
9586 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009587
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009588 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009589#ifdef Py_DEBUG
9590 use_memcpy = 0;
9591#else
9592 if (use_memcpy) {
9593 res_data = PyUnicode_1BYTE_DATA(res);
9594 kind = PyUnicode_KIND(res);
9595 if (seplen != 0)
9596 sep_data = PyUnicode_1BYTE_DATA(sep);
9597 }
9598#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009600 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009601 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009602 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009603 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 if (use_memcpy) {
9605 Py_MEMCPY(res_data,
9606 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009607 kind * seplen);
9608 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009609 }
9610 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009611 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009612 res_offset += seplen;
9613 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009614 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009615 itemlen = PyUnicode_GET_LENGTH(item);
9616 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 if (use_memcpy) {
9618 Py_MEMCPY(res_data,
9619 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009620 kind * itemlen);
9621 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009622 }
9623 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009624 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 res_offset += itemlen;
9626 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009627 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009628 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009629 if (use_memcpy)
9630 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009631 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009632 else
9633 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009634
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009637 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009639
Benjamin Peterson29060642009-01-31 22:14:21 +00009640 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009643 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009644 return NULL;
9645}
9646
/* Store `value` into `length` consecutive characters of `data`, beginning
   at character index `start`.  `kind` selects the character width of
   `data` (1, 2 or 4 byte kind); the wchar kind is not supported.

   Every argument is parenthesized on use so that compound expressions
   (for example a conditional as `value`) expand as intended.  `length` is
   evaluated once per written character in the 2 and 4 byte cases, so
   arguments should be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: \
            assert(0); \
            break; \
        } \
    } while (0)
9670
Victor Stinnerd3f08822012-05-29 12:57:52 +02009671void
9672_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9673 Py_UCS4 fill_char)
9674{
9675 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9676 const void *data = PyUnicode_DATA(unicode);
9677 assert(PyUnicode_IS_READY(unicode));
9678 assert(unicode_modifiable(unicode));
9679 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9680 assert(start >= 0);
9681 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9682 FILL(kind, data, fill_char, start, length);
9683}
9684
Victor Stinner3fe55312012-01-04 00:33:50 +01009685Py_ssize_t
9686PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9687 Py_UCS4 fill_char)
9688{
9689 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009690
9691 if (!PyUnicode_Check(unicode)) {
9692 PyErr_BadInternalCall();
9693 return -1;
9694 }
9695 if (PyUnicode_READY(unicode) == -1)
9696 return -1;
9697 if (unicode_check_modifiable(unicode))
9698 return -1;
9699
Victor Stinnerd3f08822012-05-29 12:57:52 +02009700 if (start < 0) {
9701 PyErr_SetString(PyExc_IndexError, "string index out of range");
9702 return -1;
9703 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009704 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9705 PyErr_SetString(PyExc_ValueError,
9706 "fill character is bigger than "
9707 "the string maximum character");
9708 return -1;
9709 }
9710
9711 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9712 length = Py_MIN(maxlen, length);
9713 if (length <= 0)
9714 return 0;
9715
Victor Stinnerd3f08822012-05-29 12:57:52 +02009716 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009717 return length;
9718}
9719
Victor Stinner9310abb2011-10-05 00:59:23 +02009720static PyObject *
9721pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009722 Py_ssize_t left,
9723 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 PyObject *u;
9727 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009728 int kind;
9729 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730
9731 if (left < 0)
9732 left = 0;
9733 if (right < 0)
9734 right = 0;
9735
Victor Stinnerc4b49542011-12-11 22:44:26 +01009736 if (left == 0 && right == 0)
9737 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9740 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009741 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9742 return NULL;
9743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009745 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009747 if (!u)
9748 return NULL;
9749
9750 kind = PyUnicode_KIND(u);
9751 data = PyUnicode_DATA(u);
9752 if (left)
9753 FILL(kind, data, fill, 0, left);
9754 if (right)
9755 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009756 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009757 assert(_PyUnicode_CheckConsistency(u, 1));
9758 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759}
9760
Alexander Belopolsky40018472011-02-26 01:02:56 +00009761PyObject *
9762PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765
9766 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009767 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009768 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009769 if (PyUnicode_READY(string) == -1) {
9770 Py_DECREF(string);
9771 return NULL;
9772 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009773
Benjamin Petersonead6b532011-12-20 17:23:42 -06009774 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009776 if (PyUnicode_IS_ASCII(string))
9777 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009779 PyUnicode_GET_LENGTH(string), keepends);
9780 else
9781 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009782 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009783 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 break;
9785 case PyUnicode_2BYTE_KIND:
9786 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009787 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 PyUnicode_GET_LENGTH(string), keepends);
9789 break;
9790 case PyUnicode_4BYTE_KIND:
9791 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009792 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 PyUnicode_GET_LENGTH(string), keepends);
9794 break;
9795 default:
9796 assert(0);
9797 list = 0;
9798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799 Py_DECREF(string);
9800 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801}
9802
Alexander Belopolsky40018472011-02-26 01:02:56 +00009803static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009804split(PyObject *self,
9805 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009806 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 int kind1, kind2, kind;
9809 void *buf1, *buf2;
9810 Py_ssize_t len1, len2;
9811 PyObject* out;
9812
Guido van Rossumd57fd912000-03-10 22:53:23 +00009813 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009814 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 if (PyUnicode_READY(self) == -1)
9817 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009820 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009822 if (PyUnicode_IS_ASCII(self))
9823 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009824 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009825 PyUnicode_GET_LENGTH(self), maxcount
9826 );
9827 else
9828 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009829 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009830 PyUnicode_GET_LENGTH(self), maxcount
9831 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 case PyUnicode_2BYTE_KIND:
9833 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009834 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 PyUnicode_GET_LENGTH(self), maxcount
9836 );
9837 case PyUnicode_4BYTE_KIND:
9838 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009839 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 PyUnicode_GET_LENGTH(self), maxcount
9841 );
9842 default:
9843 assert(0);
9844 return NULL;
9845 }
9846
9847 if (PyUnicode_READY(substring) == -1)
9848 return NULL;
9849
9850 kind1 = PyUnicode_KIND(self);
9851 kind2 = PyUnicode_KIND(substring);
9852 kind = kind1 > kind2 ? kind1 : kind2;
9853 buf1 = PyUnicode_DATA(self);
9854 buf2 = PyUnicode_DATA(substring);
9855 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009856 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (!buf1)
9858 return NULL;
9859 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009860 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 if (!buf2) {
9862 if (kind1 != kind) PyMem_Free(buf1);
9863 return NULL;
9864 }
9865 len1 = PyUnicode_GET_LENGTH(self);
9866 len2 = PyUnicode_GET_LENGTH(substring);
9867
Benjamin Petersonead6b532011-12-20 17:23:42 -06009868 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009870 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9871 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009872 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009873 else
9874 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_2BYTE_KIND:
9878 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 break;
9881 case PyUnicode_4BYTE_KIND:
9882 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009883 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009884 break;
9885 default:
9886 out = NULL;
9887 }
9888 if (kind1 != kind)
9889 PyMem_Free(buf1);
9890 if (kind2 != kind)
9891 PyMem_Free(buf2);
9892 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893}
9894
Alexander Belopolsky40018472011-02-26 01:02:56 +00009895static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009896rsplit(PyObject *self,
9897 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009898 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 int kind1, kind2, kind;
9901 void *buf1, *buf2;
9902 Py_ssize_t len1, len2;
9903 PyObject* out;
9904
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009905 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009906 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (PyUnicode_READY(self) == -1)
9909 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009912 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009914 if (PyUnicode_IS_ASCII(self))
9915 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009916 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009917 PyUnicode_GET_LENGTH(self), maxcount
9918 );
9919 else
9920 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009921 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009922 PyUnicode_GET_LENGTH(self), maxcount
9923 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 case PyUnicode_2BYTE_KIND:
9925 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009926 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009927 PyUnicode_GET_LENGTH(self), maxcount
9928 );
9929 case PyUnicode_4BYTE_KIND:
9930 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009931 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 PyUnicode_GET_LENGTH(self), maxcount
9933 );
9934 default:
9935 assert(0);
9936 return NULL;
9937 }
9938
9939 if (PyUnicode_READY(substring) == -1)
9940 return NULL;
9941
9942 kind1 = PyUnicode_KIND(self);
9943 kind2 = PyUnicode_KIND(substring);
9944 kind = kind1 > kind2 ? kind1 : kind2;
9945 buf1 = PyUnicode_DATA(self);
9946 buf2 = PyUnicode_DATA(substring);
9947 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009948 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (!buf1)
9950 return NULL;
9951 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009952 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 if (!buf2) {
9954 if (kind1 != kind) PyMem_Free(buf1);
9955 return NULL;
9956 }
9957 len1 = PyUnicode_GET_LENGTH(self);
9958 len2 = PyUnicode_GET_LENGTH(substring);
9959
Benjamin Petersonead6b532011-12-20 17:23:42 -06009960 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009962 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9963 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009964 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009965 else
9966 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_2BYTE_KIND:
9970 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 case PyUnicode_4BYTE_KIND:
9974 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009975 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009976 break;
9977 default:
9978 out = NULL;
9979 }
9980 if (kind1 != kind)
9981 PyMem_Free(buf1);
9982 if (kind2 != kind)
9983 PyMem_Free(buf2);
9984 return out;
9985}
9986
9987static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009988anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9989 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009991 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009993 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9994 return asciilib_find(buf1, len1, buf2, len2, offset);
9995 else
9996 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009997 case PyUnicode_2BYTE_KIND:
9998 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9999 case PyUnicode_4BYTE_KIND:
10000 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10001 }
10002 assert(0);
10003 return -1;
10004}
10005
10006static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010007anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10008 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010010 switch (kind) {
10011 case PyUnicode_1BYTE_KIND:
10012 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10013 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10014 else
10015 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10016 case PyUnicode_2BYTE_KIND:
10017 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10018 case PyUnicode_4BYTE_KIND:
10019 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10020 }
10021 assert(0);
10022 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010023}
10024
Alexander Belopolsky40018472011-02-26 01:02:56 +000010025static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026replace(PyObject *self, PyObject *str1,
10027 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 PyObject *u;
10030 char *sbuf = PyUnicode_DATA(self);
10031 char *buf1 = PyUnicode_DATA(str1);
10032 char *buf2 = PyUnicode_DATA(str2);
10033 int srelease = 0, release1 = 0, release2 = 0;
10034 int skind = PyUnicode_KIND(self);
10035 int kind1 = PyUnicode_KIND(str1);
10036 int kind2 = PyUnicode_KIND(str2);
10037 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10038 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10039 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010040 int mayshrink;
10041 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042
10043 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010044 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010046 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010047
Victor Stinner59de0ee2011-10-07 10:01:28 +020010048 if (str1 == str2)
10049 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050 if (skind < kind1)
10051 /* substring too wide to be present */
10052 goto nothing;
10053
Victor Stinner49a0a212011-10-12 23:46:10 +020010054 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10055 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10056 /* Replacing str1 with str2 may cause a maxchar reduction in the
10057 result string. */
10058 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010059 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010063 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010064 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010065 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010066 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010067 Py_UCS4 u1, u2;
10068 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010069 Py_ssize_t index, pos;
10070 char *src;
10071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010073 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10074 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010075 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010076 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010078 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010080 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010082
10083 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10084 index = 0;
10085 src = sbuf;
10086 while (--maxcount)
10087 {
10088 pos++;
10089 src += pos * PyUnicode_KIND(self);
10090 slen -= pos;
10091 index += pos;
10092 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10093 if (pos < 0)
10094 break;
10095 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10096 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010097 }
10098 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 int rkind = skind;
10100 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010101 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (kind1 < rkind) {
10104 /* widen substring */
10105 buf1 = _PyUnicode_AsKind(str1, rkind);
10106 if (!buf1) goto error;
10107 release1 = 1;
10108 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010109 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010110 if (i < 0)
10111 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (rkind > kind2) {
10113 /* widen replacement */
10114 buf2 = _PyUnicode_AsKind(str2, rkind);
10115 if (!buf2) goto error;
10116 release2 = 1;
10117 }
10118 else if (rkind < kind2) {
10119 /* widen self and buf1 */
10120 rkind = kind2;
10121 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010122 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 sbuf = _PyUnicode_AsKind(self, rkind);
10124 if (!sbuf) goto error;
10125 srelease = 1;
10126 buf1 = _PyUnicode_AsKind(str1, rkind);
10127 if (!buf1) goto error;
10128 release1 = 1;
10129 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010130 u = PyUnicode_New(slen, maxchar);
10131 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010133 assert(PyUnicode_KIND(u) == rkind);
10134 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010135
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010137 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010138 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010140 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010142
10143 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010144 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010146 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010147 if (i == -1)
10148 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010149 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010150 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010151 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010153 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010155 }
10156 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010158 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 int rkind = skind;
10160 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010163 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 buf1 = _PyUnicode_AsKind(str1, rkind);
10165 if (!buf1) goto error;
10166 release1 = 1;
10167 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010168 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169 if (n == 0)
10170 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010172 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 buf2 = _PyUnicode_AsKind(str2, rkind);
10174 if (!buf2) goto error;
10175 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010178 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 rkind = kind2;
10180 sbuf = _PyUnicode_AsKind(self, rkind);
10181 if (!sbuf) goto error;
10182 srelease = 1;
10183 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010184 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 buf1 = _PyUnicode_AsKind(str1, rkind);
10186 if (!buf1) goto error;
10187 release1 = 1;
10188 }
10189 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10190 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010191 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 PyErr_SetString(PyExc_OverflowError,
10193 "replace string is too long");
10194 goto error;
10195 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010196 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 if (new_size == 0) {
10198 Py_INCREF(unicode_empty);
10199 u = unicode_empty;
10200 goto done;
10201 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010202 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 PyErr_SetString(PyExc_OverflowError,
10204 "replace string is too long");
10205 goto error;
10206 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010207 u = PyUnicode_New(new_size, maxchar);
10208 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 assert(PyUnicode_KIND(u) == rkind);
10211 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 ires = i = 0;
10213 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010214 while (n-- > 0) {
10215 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010219 if (j == -1)
10220 break;
10221 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010222 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 memcpy(res + rkind * ires,
10224 sbuf + rkind * i,
10225 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010227 }
10228 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010238 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010239 memcpy(res + rkind * ires,
10240 sbuf + rkind * i,
10241 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010242 }
10243 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 /* interleave */
10245 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010248 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010250 if (--n <= 0)
10251 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * ires,
10253 sbuf + rkind * i,
10254 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010255 ires++;
10256 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010257 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010258 memcpy(res + rkind * ires,
10259 sbuf + rkind * i,
10260 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010261 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010262 }
10263
10264 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010265 unicode_adjust_maxchar(&u);
10266 if (u == NULL)
10267 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010269
10270 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 if (srelease)
10272 PyMem_FREE(sbuf);
10273 if (release1)
10274 PyMem_FREE(buf1);
10275 if (release2)
10276 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010277 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279
Benjamin Peterson29060642009-01-31 22:14:21 +000010280 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010281 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (srelease)
10283 PyMem_FREE(sbuf);
10284 if (release1)
10285 PyMem_FREE(buf1);
10286 if (release2)
10287 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010288 return unicode_result_unchanged(self);
10289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 error:
10291 if (srelease && sbuf)
10292 PyMem_FREE(sbuf);
10293 if (release1 && buf1)
10294 PyMem_FREE(buf1);
10295 if (release2 && buf2)
10296 PyMem_FREE(buf2);
10297 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298}
10299
10300/* --- Unicode Object Methods --------------------------------------------- */
10301
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010302PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010303 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304\n\
10305Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010306characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010307
10308static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010309unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010310{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010311 if (PyUnicode_READY(self) == -1)
10312 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010313 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010314}
10315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010316PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010317 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318\n\
10319Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010320have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321
10322static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010323unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010325 if (PyUnicode_READY(self) == -1)
10326 return NULL;
10327 if (PyUnicode_GET_LENGTH(self) == 0)
10328 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010329 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330}
10331
Benjamin Petersond5890c82012-01-14 13:23:30 -050010332PyDoc_STRVAR(casefold__doc__,
10333 "S.casefold() -> str\n\
10334\n\
10335Return a version of S suitable for caseless comparisons.");
10336
10337static PyObject *
10338unicode_casefold(PyObject *self)
10339{
10340 if (PyUnicode_READY(self) == -1)
10341 return NULL;
10342 if (PyUnicode_IS_ASCII(self))
10343 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010344 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010345}
10346
10347
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010348/* Argument converter. Coerces to a single unicode character */
10349
10350static int
10351convert_uc(PyObject *obj, void *addr)
10352{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010355
Benjamin Peterson14339b62009-01-31 16:36:08 +000010356 uniobj = PyUnicode_FromObject(obj);
10357 if (uniobj == NULL) {
10358 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010360 return 0;
10361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010364 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010365 Py_DECREF(uniobj);
10366 return 0;
10367 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010369 Py_DECREF(uniobj);
10370 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010371}
10372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010373PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010374 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010376Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010377done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010378
10379static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010380unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010382 Py_ssize_t marg, left;
10383 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 Py_UCS4 fillchar = ' ';
10385
Victor Stinnere9a29352011-10-01 02:14:59 +020010386 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Benjamin Petersonbac79492012-01-14 13:34:47 -050010389 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390 return NULL;
10391
Victor Stinnerc4b49542011-12-11 22:44:26 +010010392 if (PyUnicode_GET_LENGTH(self) >= width)
10393 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394
Victor Stinnerc4b49542011-12-11 22:44:26 +010010395 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396 left = marg / 2 + (marg & width & 1);
10397
Victor Stinner9310abb2011-10-05 00:59:23 +020010398 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399}
10400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401/* This function assumes that str1 and str2 are readied by the caller. */
10402
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010404unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010405{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010406 int kind1, kind2;
10407 void *data1, *data2;
10408 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 kind1 = PyUnicode_KIND(str1);
10411 kind2 = PyUnicode_KIND(str2);
10412 data1 = PyUnicode_DATA(str1);
10413 data2 = PyUnicode_DATA(str2);
10414 len1 = PyUnicode_GET_LENGTH(str1);
10415 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417 for (i = 0; i < len1 && i < len2; ++i) {
10418 Py_UCS4 c1, c2;
10419 c1 = PyUnicode_READ(kind1, data1, i);
10420 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010421
10422 if (c1 != c2)
10423 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010424 }
10425
10426 return (len1 < len2) ? -1 : (len1 != len2);
10427}
10428
Alexander Belopolsky40018472011-02-26 01:02:56 +000010429int
10430PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010431{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10433 if (PyUnicode_READY(left) == -1 ||
10434 PyUnicode_READY(right) == -1)
10435 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010436 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010438 PyErr_Format(PyExc_TypeError,
10439 "Can't compare %.100s and %.100s",
10440 left->ob_type->tp_name,
10441 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010442 return -1;
10443}
10444
Martin v. Löwis5b222132007-06-10 09:51:05 +000010445int
10446PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 Py_ssize_t i;
10449 int kind;
10450 void *data;
10451 Py_UCS4 chr;
10452
Victor Stinner910337b2011-10-03 03:20:16 +020010453 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (PyUnicode_READY(uni) == -1)
10455 return -1;
10456 kind = PyUnicode_KIND(uni);
10457 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010458 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10460 if (chr != str[i])
10461 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010462 /* This check keeps Python strings that end in '\0' from comparing equal
10463 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010466 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010467 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010468 return 0;
10469}
10470
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010471
Benjamin Peterson29060642009-01-31 22:14:21 +000010472#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010473 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010474
Alexander Belopolsky40018472011-02-26 01:02:56 +000010475PyObject *
10476PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010477{
10478 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010479
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010480 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10481 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 if (PyUnicode_READY(left) == -1 ||
10483 PyUnicode_READY(right) == -1)
10484 return NULL;
10485 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10486 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010487 if (op == Py_EQ) {
10488 Py_INCREF(Py_False);
10489 return Py_False;
10490 }
10491 if (op == Py_NE) {
10492 Py_INCREF(Py_True);
10493 return Py_True;
10494 }
10495 }
10496 if (left == right)
10497 result = 0;
10498 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010499 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010500
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010501 /* Convert the return value to a Boolean */
10502 switch (op) {
10503 case Py_EQ:
10504 v = TEST_COND(result == 0);
10505 break;
10506 case Py_NE:
10507 v = TEST_COND(result != 0);
10508 break;
10509 case Py_LE:
10510 v = TEST_COND(result <= 0);
10511 break;
10512 case Py_GE:
10513 v = TEST_COND(result >= 0);
10514 break;
10515 case Py_LT:
10516 v = TEST_COND(result == -1);
10517 break;
10518 case Py_GT:
10519 v = TEST_COND(result == 1);
10520 break;
10521 default:
10522 PyErr_BadArgument();
10523 return NULL;
10524 }
10525 Py_INCREF(v);
10526 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010527 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010528
Brian Curtindfc80e32011-08-10 20:28:54 -050010529 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010530}
10531
Alexander Belopolsky40018472011-02-26 01:02:56 +000010532int
10533PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010534{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 int kind1, kind2, kind;
10537 void *buf1, *buf2;
10538 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010539 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010540
10541 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010542 sub = PyUnicode_FromObject(element);
10543 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 PyErr_Format(PyExc_TypeError,
10545 "'in <string>' requires string as left operand, not %s",
10546 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010547 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010548 }
10549
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010551 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010552 Py_DECREF(sub);
10553 return -1;
10554 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010555 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10556 Py_DECREF(sub);
10557 Py_DECREF(str);
10558 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 kind1 = PyUnicode_KIND(str);
10561 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010562 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 buf1 = PyUnicode_DATA(str);
10564 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010565 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010566 if (kind2 > kind) {
10567 Py_DECREF(sub);
10568 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010569 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010570 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010571 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 if (!buf2) {
10574 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010575 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 return -1;
10577 }
10578 len1 = PyUnicode_GET_LENGTH(str);
10579 len2 = PyUnicode_GET_LENGTH(sub);
10580
Benjamin Petersonead6b532011-12-20 17:23:42 -060010581 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 case PyUnicode_1BYTE_KIND:
10583 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10584 break;
10585 case PyUnicode_2BYTE_KIND:
10586 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10587 break;
10588 case PyUnicode_4BYTE_KIND:
10589 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10590 break;
10591 default:
10592 result = -1;
10593 assert(0);
10594 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010595
10596 Py_DECREF(str);
10597 Py_DECREF(sub);
10598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 if (kind2 != kind)
10600 PyMem_Free(buf2);
10601
Guido van Rossum403d68b2000-03-13 15:55:09 +000010602 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010603}
10604
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605/* Concat to string or Unicode object giving a new Unicode object. */
10606
Alexander Belopolsky40018472011-02-26 01:02:56 +000010607PyObject *
10608PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010611 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010612 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613
10614 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010617 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
10622 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010623 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010624 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010627 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010628 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010629 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630 }
10631
Victor Stinner488fa492011-12-12 00:01:39 +010010632 u_len = PyUnicode_GET_LENGTH(u);
10633 v_len = PyUnicode_GET_LENGTH(v);
10634 if (u_len > PY_SSIZE_T_MAX - v_len) {
10635 PyErr_SetString(PyExc_OverflowError,
10636 "strings are too large to concat");
10637 goto onError;
10638 }
10639 new_len = u_len + v_len;
10640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010642 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010643 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010646 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010648 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010649 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10650 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651 Py_DECREF(u);
10652 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010653 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655
Benjamin Peterson29060642009-01-31 22:14:21 +000010656 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010657 Py_XDECREF(u);
10658 Py_XDECREF(v);
10659 return NULL;
10660}
10661
Walter Dörwald1ab83302007-05-18 17:15:44 +000010662void
Victor Stinner23e56682011-10-03 03:54:37 +020010663PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010664{
Victor Stinner23e56682011-10-03 03:54:37 +020010665 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010666 Py_UCS4 maxchar, maxchar2;
10667 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010668
10669 if (p_left == NULL) {
10670 if (!PyErr_Occurred())
10671 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010672 return;
10673 }
Victor Stinner23e56682011-10-03 03:54:37 +020010674 left = *p_left;
10675 if (right == NULL || !PyUnicode_Check(left)) {
10676 if (!PyErr_Occurred())
10677 PyErr_BadInternalCall();
10678 goto error;
10679 }
10680
Benjamin Petersonbac79492012-01-14 13:34:47 -050010681 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010682 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010683 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010684 goto error;
10685
Victor Stinner488fa492011-12-12 00:01:39 +010010686 /* Shortcuts */
10687 if (left == unicode_empty) {
10688 Py_DECREF(left);
10689 Py_INCREF(right);
10690 *p_left = right;
10691 return;
10692 }
10693 if (right == unicode_empty)
10694 return;
10695
10696 left_len = PyUnicode_GET_LENGTH(left);
10697 right_len = PyUnicode_GET_LENGTH(right);
10698 if (left_len > PY_SSIZE_T_MAX - right_len) {
10699 PyErr_SetString(PyExc_OverflowError,
10700 "strings are too large to concat");
10701 goto error;
10702 }
10703 new_len = left_len + right_len;
10704
10705 if (unicode_modifiable(left)
10706 && PyUnicode_CheckExact(right)
10707 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010708 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10709 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010710 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010711 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010712 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10713 {
10714 /* append inplace */
10715 if (unicode_resize(p_left, new_len) != 0) {
10716 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10717 * deallocated so it cannot be put back into
10718 * 'variable'. The MemoryError is raised when there
10719 * is no value in 'variable', which might (very
10720 * remotely) be a cause of incompatibilities.
10721 */
10722 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010723 }
Victor Stinner488fa492011-12-12 00:01:39 +010010724 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010725 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010726 }
Victor Stinner488fa492011-12-12 00:01:39 +010010727 else {
10728 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10729 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010730 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010731
Victor Stinner488fa492011-12-12 00:01:39 +010010732 /* Concat the two Unicode strings */
10733 res = PyUnicode_New(new_len, maxchar);
10734 if (res == NULL)
10735 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010736 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10737 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010738 Py_DECREF(left);
10739 *p_left = res;
10740 }
10741 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010742 return;
10743
10744error:
Victor Stinner488fa492011-12-12 00:01:39 +010010745 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010746}
10747
10748void
10749PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10750{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010751 PyUnicode_Append(pleft, right);
10752 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010753}
10754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010755PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010756 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010758Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010759string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010760interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761
10762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010763unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010764{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010765 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010766 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010767 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 int kind1, kind2, kind;
10770 void *buf1, *buf2;
10771 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010772
Jesus Ceaac451502011-04-20 17:09:23 +020010773 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10774 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010775 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 kind1 = PyUnicode_KIND(self);
10778 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010779 if (kind2 > kind1)
10780 return PyLong_FromLong(0);
10781 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 buf1 = PyUnicode_DATA(self);
10783 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010785 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (!buf2) {
10787 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 return NULL;
10789 }
10790 len1 = PyUnicode_GET_LENGTH(self);
10791 len2 = PyUnicode_GET_LENGTH(substring);
10792
10793 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010794 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 case PyUnicode_1BYTE_KIND:
10796 iresult = ucs1lib_count(
10797 ((Py_UCS1*)buf1) + start, end - start,
10798 buf2, len2, PY_SSIZE_T_MAX
10799 );
10800 break;
10801 case PyUnicode_2BYTE_KIND:
10802 iresult = ucs2lib_count(
10803 ((Py_UCS2*)buf1) + start, end - start,
10804 buf2, len2, PY_SSIZE_T_MAX
10805 );
10806 break;
10807 case PyUnicode_4BYTE_KIND:
10808 iresult = ucs4lib_count(
10809 ((Py_UCS4*)buf1) + start, end - start,
10810 buf2, len2, PY_SSIZE_T_MAX
10811 );
10812 break;
10813 default:
10814 assert(0); iresult = 0;
10815 }
10816
10817 result = PyLong_FromSsize_t(iresult);
10818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (kind2 != kind)
10820 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010821
10822 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010823
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824 return result;
10825}
10826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010827PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010828 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010829\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010830Encode S using the codec registered for encoding. Default encoding\n\
10831is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010832handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010833a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10834'xmlcharrefreplace' as well as any other name registered with\n\
10835codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010838unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010840 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841 char *encoding = NULL;
10842 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010843
Benjamin Peterson308d6372009-09-18 21:42:35 +000010844 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10845 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010847 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010848}
10849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010850PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852\n\
10853Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010854If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855
10856static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010857unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010859 Py_ssize_t i, j, line_pos, src_len, incr;
10860 Py_UCS4 ch;
10861 PyObject *u;
10862 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010864 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010865 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010866
10867 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869
Antoine Pitrou22425222011-10-04 19:10:51 +020010870 if (PyUnicode_READY(self) == -1)
10871 return NULL;
10872
Thomas Wouters7e474022000-07-16 12:04:32 +000010873 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010874 src_len = PyUnicode_GET_LENGTH(self);
10875 i = j = line_pos = 0;
10876 kind = PyUnicode_KIND(self);
10877 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010878 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010879 for (; i < src_len; i++) {
10880 ch = PyUnicode_READ(kind, src_data, i);
10881 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010882 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010885 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010886 goto overflow;
10887 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010889 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010892 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010893 goto overflow;
10894 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010896 if (ch == '\n' || ch == '\r')
10897 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010898 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010899 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010900 if (!found)
10901 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010902
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010904 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905 if (!u)
10906 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010910
Antoine Pitroue71d5742011-10-04 15:55:09 +020010911 for (; i < src_len; i++) {
10912 ch = PyUnicode_READ(kind, src_data, i);
10913 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010915 incr = tabsize - (line_pos % tabsize);
10916 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010917 FILL(kind, dest_data, ' ', j, incr);
10918 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010920 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010921 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010922 line_pos++;
10923 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010924 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010925 if (ch == '\n' || ch == '\r')
10926 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010928 }
10929 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010930 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010931
Antoine Pitroue71d5742011-10-04 15:55:09 +020010932 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010933 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10934 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935}
10936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010937PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939\n\
10940Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010941such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942arguments start and end are interpreted as in slice notation.\n\
10943\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945
10946static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010949 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010950 Py_ssize_t start;
10951 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010952 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Jesus Ceaac451502011-04-20 17:09:23 +020010954 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10955 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (PyUnicode_READY(self) == -1)
10959 return NULL;
10960 if (PyUnicode_READY(substring) == -1)
10961 return NULL;
10962
Victor Stinner7931d9a2011-11-04 00:22:48 +010010963 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010964
10965 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010966
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (result == -2)
10968 return NULL;
10969
Christian Heimes217cfd12007-12-02 14:31:20 +000010970 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971}
10972
10973static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010974unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010976 void *data;
10977 enum PyUnicode_Kind kind;
10978 Py_UCS4 ch;
10979 PyObject *res;
10980
10981 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10982 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010984 }
10985 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10986 PyErr_SetString(PyExc_IndexError, "string index out of range");
10987 return NULL;
10988 }
10989 kind = PyUnicode_KIND(self);
10990 data = PyUnicode_DATA(self);
10991 ch = PyUnicode_READ(kind, data, index);
10992 if (ch < 256)
10993 return get_latin1_char(ch);
10994
10995 res = PyUnicode_New(1, ch);
10996 if (res == NULL)
10997 return NULL;
10998 kind = PyUnicode_KIND(res);
10999 data = PyUnicode_DATA(res);
11000 PyUnicode_WRITE(kind, data, 0, ch);
11001 assert(_PyUnicode_CheckConsistency(res, 1));
11002 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003}
11004
Guido van Rossumc2504932007-09-18 19:42:40 +000011005/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011006 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011007static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011008unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009{
Guido van Rossumc2504932007-09-18 19:42:40 +000011010 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011011 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011012
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011013#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011014 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011015#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 if (_PyUnicode_HASH(self) != -1)
11017 return _PyUnicode_HASH(self);
11018 if (PyUnicode_READY(self) == -1)
11019 return -1;
11020 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011021 /*
11022 We make the hash of the empty string be 0, rather than using
11023 (prefix ^ suffix), since this slightly obfuscates the hash secret
11024 */
11025 if (len == 0) {
11026 _PyUnicode_HASH(self) = 0;
11027 return 0;
11028 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029
11030 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011031#define HASH(P) \
11032 x ^= (Py_uhash_t) *P << 7; \
11033 while (--len >= 0) \
11034 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035
Georg Brandl2fb477c2012-02-21 00:33:36 +010011036 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 switch (PyUnicode_KIND(self)) {
11038 case PyUnicode_1BYTE_KIND: {
11039 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11040 HASH(c);
11041 break;
11042 }
11043 case PyUnicode_2BYTE_KIND: {
11044 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11045 HASH(s);
11046 break;
11047 }
11048 default: {
11049 Py_UCS4 *l;
11050 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11051 "Impossible switch case in unicode_hash");
11052 l = PyUnicode_4BYTE_DATA(self);
11053 HASH(l);
11054 break;
11055 }
11056 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011057 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11058 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059
Guido van Rossumc2504932007-09-18 19:42:40 +000011060 if (x == -1)
11061 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011063 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011067PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011068 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011070Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
11072static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011075 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011076 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011077 Py_ssize_t start;
11078 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079
Jesus Ceaac451502011-04-20 17:09:23 +020011080 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11081 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011083
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011084 if (PyUnicode_READY(self) == -1)
11085 return NULL;
11086 if (PyUnicode_READY(substring) == -1)
11087 return NULL;
11088
Victor Stinner7931d9a2011-11-04 00:22:48 +010011089 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090
11091 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (result == -2)
11094 return NULL;
11095
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 if (result < 0) {
11097 PyErr_SetString(PyExc_ValueError, "substring not found");
11098 return NULL;
11099 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011100
Christian Heimes217cfd12007-12-02 14:31:20 +000011101 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102}
11103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011104PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011105 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011107Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011111unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 Py_ssize_t i, length;
11114 int kind;
11115 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011116 int cased;
11117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011118 if (PyUnicode_READY(self) == -1)
11119 return NULL;
11120 length = PyUnicode_GET_LENGTH(self);
11121 kind = PyUnicode_KIND(self);
11122 data = PyUnicode_DATA(self);
11123
Guido van Rossumd57fd912000-03-10 22:53:23 +000011124 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011125 if (length == 1)
11126 return PyBool_FromLong(
11127 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011129 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011130 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011132
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 for (i = 0; i < length; i++) {
11135 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011136
Benjamin Peterson29060642009-01-31 22:14:21 +000011137 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11138 return PyBool_FromLong(0);
11139 else if (!cased && Py_UNICODE_ISLOWER(ch))
11140 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011142 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143}
11144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011145PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011146 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011148Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011149at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
11151static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011152unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011154 Py_ssize_t i, length;
11155 int kind;
11156 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157 int cased;
11158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 if (PyUnicode_READY(self) == -1)
11160 return NULL;
11161 length = PyUnicode_GET_LENGTH(self);
11162 kind = PyUnicode_KIND(self);
11163 data = PyUnicode_DATA(self);
11164
Guido van Rossumd57fd912000-03-10 22:53:23 +000011165 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 if (length == 1)
11167 return PyBool_FromLong(
11168 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011170 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011173
Guido van Rossumd57fd912000-03-10 22:53:23 +000011174 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 for (i = 0; i < length; i++) {
11176 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011177
Benjamin Peterson29060642009-01-31 22:14:21 +000011178 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11179 return PyBool_FromLong(0);
11180 else if (!cased && Py_UNICODE_ISUPPER(ch))
11181 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011183 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184}
11185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011186PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011187 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011189Return True if S is a titlecased string and there is at least one\n\
11190character in S, i.e. upper- and titlecase characters may only\n\
11191follow uncased characters and lowercase characters only cased ones.\n\
11192Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193
11194static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011195unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011197 Py_ssize_t i, length;
11198 int kind;
11199 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200 int cased, previous_is_cased;
11201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011202 if (PyUnicode_READY(self) == -1)
11203 return NULL;
11204 length = PyUnicode_GET_LENGTH(self);
11205 kind = PyUnicode_KIND(self);
11206 data = PyUnicode_DATA(self);
11207
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 if (length == 1) {
11210 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11211 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11212 (Py_UNICODE_ISUPPER(ch) != 0));
11213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011215 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011218
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 cased = 0;
11220 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 for (i = 0; i < length; i++) {
11222 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011223
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11225 if (previous_is_cased)
11226 return PyBool_FromLong(0);
11227 previous_is_cased = 1;
11228 cased = 1;
11229 }
11230 else if (Py_UNICODE_ISLOWER(ch)) {
11231 if (!previous_is_cased)
11232 return PyBool_FromLong(0);
11233 previous_is_cased = 1;
11234 cased = 1;
11235 }
11236 else
11237 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011239 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240}
11241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011242PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011243 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011245Return True if all characters in S are whitespace\n\
11246and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
11248static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011249unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011250{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251 Py_ssize_t i, length;
11252 int kind;
11253 void *data;
11254
11255 if (PyUnicode_READY(self) == -1)
11256 return NULL;
11257 length = PyUnicode_GET_LENGTH(self);
11258 kind = PyUnicode_KIND(self);
11259 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 if (length == 1)
11263 return PyBool_FromLong(
11264 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011266 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 for (i = 0; i < length; i++) {
11271 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011272 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011275 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276}
11277
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011278PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011279 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011280\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011281Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011283
11284static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011285unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 Py_ssize_t i, length;
11288 int kind;
11289 void *data;
11290
11291 if (PyUnicode_READY(self) == -1)
11292 return NULL;
11293 length = PyUnicode_GET_LENGTH(self);
11294 kind = PyUnicode_KIND(self);
11295 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011296
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011297 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 if (length == 1)
11299 return PyBool_FromLong(
11300 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011301
11302 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 for (i = 0; i < length; i++) {
11307 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011310 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011311}
11312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011313PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011315\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011316Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011317and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011318
11319static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011320unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011321{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011322 int kind;
11323 void *data;
11324 Py_ssize_t len, i;
11325
11326 if (PyUnicode_READY(self) == -1)
11327 return NULL;
11328
11329 kind = PyUnicode_KIND(self);
11330 data = PyUnicode_DATA(self);
11331 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011332
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011333 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (len == 1) {
11335 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11336 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11337 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011338
11339 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011341 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 for (i = 0; i < len; i++) {
11344 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011345 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011347 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011348 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011349}
11350
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011351PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011352 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011354Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011355False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011356
11357static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011358unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 Py_ssize_t i, length;
11361 int kind;
11362 void *data;
11363
11364 if (PyUnicode_READY(self) == -1)
11365 return NULL;
11366 length = PyUnicode_GET_LENGTH(self);
11367 kind = PyUnicode_KIND(self);
11368 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011371 if (length == 1)
11372 return PyBool_FromLong(
11373 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011375 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011376 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011378
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 for (i = 0; i < length; i++) {
11380 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011383 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384}
11385
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011386PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011387 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011389Return True if all characters in S are digits\n\
11390and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011391
11392static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011393unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011395 Py_ssize_t i, length;
11396 int kind;
11397 void *data;
11398
11399 if (PyUnicode_READY(self) == -1)
11400 return NULL;
11401 length = PyUnicode_GET_LENGTH(self);
11402 kind = PyUnicode_KIND(self);
11403 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (length == 1) {
11407 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11408 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011411 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415 for (i = 0; i < length; i++) {
11416 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011419 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011425Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427
11428static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011429unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431 Py_ssize_t i, length;
11432 int kind;
11433 void *data;
11434
11435 if (PyUnicode_READY(self) == -1)
11436 return NULL;
11437 length = PyUnicode_GET_LENGTH(self);
11438 kind = PyUnicode_KIND(self);
11439 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (length == 1)
11443 return PyBool_FromLong(
11444 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011446 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 for (i = 0; i < length; i++) {
11451 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011452 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011454 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455}
11456
Martin v. Löwis47383402007-08-15 07:32:56 +000011457int
11458PyUnicode_IsIdentifier(PyObject *self)
11459{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 int kind;
11461 void *data;
11462 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011463 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 if (PyUnicode_READY(self) == -1) {
11466 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 }
11469
11470 /* Special case for empty strings */
11471 if (PyUnicode_GET_LENGTH(self) == 0)
11472 return 0;
11473 kind = PyUnicode_KIND(self);
11474 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011475
11476 /* PEP 3131 says that the first character must be in
11477 XID_Start and subsequent characters in XID_Continue,
11478 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011479 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011480 letters, digits, underscore). However, given the current
11481 definition of XID_Start and XID_Continue, it is sufficient
11482 to check just for these, except that _ must be allowed
11483 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011485 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011486 return 0;
11487
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011488 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011491 return 1;
11492}
11493
11494PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011495 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011496\n\
11497Return True if S is a valid identifier according\n\
11498to the language definition.");
11499
11500static PyObject*
11501unicode_isidentifier(PyObject *self)
11502{
11503 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11504}
11505
Georg Brandl559e5d72008-06-11 18:37:52 +000011506PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011508\n\
11509Return True if all characters in S are considered\n\
11510printable in repr() or S is empty, False otherwise.");
11511
11512static PyObject*
11513unicode_isprintable(PyObject *self)
11514{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 Py_ssize_t i, length;
11516 int kind;
11517 void *data;
11518
11519 if (PyUnicode_READY(self) == -1)
11520 return NULL;
11521 length = PyUnicode_GET_LENGTH(self);
11522 kind = PyUnicode_KIND(self);
11523 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011524
11525 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 if (length == 1)
11527 return PyBool_FromLong(
11528 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 for (i = 0; i < length; i++) {
11531 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011532 Py_RETURN_FALSE;
11533 }
11534 }
11535 Py_RETURN_TRUE;
11536}
11537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011538PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011539 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540\n\
11541Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011542iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543
11544static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011545unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011547 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548}
11549
Martin v. Löwis18e16552006-02-15 17:27:45 +000011550static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011551unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (PyUnicode_READY(self) == -1)
11554 return -1;
11555 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556}
11557
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011558PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011561Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011562done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563
11564static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011565unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011567 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011568 Py_UCS4 fillchar = ' ';
11569
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011570 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571 return NULL;
11572
Benjamin Petersonbac79492012-01-14 13:34:47 -050011573 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011574 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Victor Stinnerc4b49542011-12-11 22:44:26 +010011576 if (PyUnicode_GET_LENGTH(self) >= width)
11577 return unicode_result_unchanged(self);
11578
11579 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580}
11581
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011582PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
11587static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011588unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011590 if (PyUnicode_READY(self) == -1)
11591 return NULL;
11592 if (PyUnicode_IS_ASCII(self))
11593 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011594 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595}
11596
/* Which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the constants above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: the format string with its first three
   characters ("|O:") skipped */
#define STRIPNAME(i) (stripformat[i]+3)
11605
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011606/* externally visible for str.strip(unicode) */
11607PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011608_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 void *data;
11611 int kind;
11612 Py_ssize_t i, j, len;
11613 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11616 return NULL;
11617
11618 kind = PyUnicode_KIND(self);
11619 data = PyUnicode_DATA(self);
11620 len = PyUnicode_GET_LENGTH(self);
11621 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11622 PyUnicode_DATA(sepobj),
11623 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011624
Benjamin Peterson14339b62009-01-31 16:36:08 +000011625 i = 0;
11626 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 while (i < len &&
11628 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 i++;
11630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011631 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011632
Benjamin Peterson14339b62009-01-31 16:36:08 +000011633 j = len;
11634 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 do {
11636 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 } while (j >= i &&
11638 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011640 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011641
Victor Stinner7931d9a2011-11-04 00:22:48 +010011642 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643}
11644
11645PyObject*
11646PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11647{
11648 unsigned char *data;
11649 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011650 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011651
Victor Stinnerde636f32011-10-01 03:55:54 +020011652 if (PyUnicode_READY(self) == -1)
11653 return NULL;
11654
Victor Stinner684d5fd2012-05-03 02:32:34 +020011655 length = PyUnicode_GET_LENGTH(self);
11656 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011657
Victor Stinner684d5fd2012-05-03 02:32:34 +020011658 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011659 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011660
Victor Stinnerde636f32011-10-01 03:55:54 +020011661 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011662 PyErr_SetString(PyExc_IndexError, "string index out of range");
11663 return NULL;
11664 }
Victor Stinner684d5fd2012-05-03 02:32:34 +020011665 if (start >= length || end < start) {
Victor Stinner3a7f79772012-05-03 03:36:40 +020011666 Py_INCREF(unicode_empty);
11667 return unicode_empty;
Victor Stinner684d5fd2012-05-03 02:32:34 +020011668 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020011669
Victor Stinner684d5fd2012-05-03 02:32:34 +020011670 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011671 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011672 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011673 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011674 }
11675 else {
11676 kind = PyUnicode_KIND(self);
11677 data = PyUnicode_1BYTE_DATA(self);
11678 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011679 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011680 length);
11681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
11684static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011685do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 int kind;
11688 void *data;
11689 Py_ssize_t len, i, j;
11690
11691 if (PyUnicode_READY(self) == -1)
11692 return NULL;
11693
11694 kind = PyUnicode_KIND(self);
11695 data = PyUnicode_DATA(self);
11696 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 i = 0;
11699 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011701 i++;
11702 }
11703 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
Benjamin Peterson14339b62009-01-31 16:36:08 +000011705 j = len;
11706 if (striptype != LEFTSTRIP) {
11707 do {
11708 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011710 j++;
11711 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712
Victor Stinner7931d9a2011-11-04 00:22:48 +010011713 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714}
11715
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
11717static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011718do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011719{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011720 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011721
Benjamin Peterson14339b62009-01-31 16:36:08 +000011722 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11723 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011724
Benjamin Peterson14339b62009-01-31 16:36:08 +000011725 if (sep != NULL && sep != Py_None) {
11726 if (PyUnicode_Check(sep))
11727 return _PyUnicode_XStrip(self, striptype, sep);
11728 else {
11729 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011730 "%s arg must be None or str",
11731 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 return NULL;
11733 }
11734 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011735
Benjamin Peterson14339b62009-01-31 16:36:08 +000011736 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011737}
11738
11739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011740PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011741 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742\n\
11743Return a copy of the string S with leading and trailing\n\
11744whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011745If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746
11747static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011748unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011749{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011750 if (PyTuple_GET_SIZE(args) == 0)
11751 return do_strip(self, BOTHSTRIP); /* Common case */
11752 else
11753 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011754}
11755
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011759\n\
11760Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011761If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762
11763static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011764unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011765{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011766 if (PyTuple_GET_SIZE(args) == 0)
11767 return do_strip(self, LEFTSTRIP); /* Common case */
11768 else
11769 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770}
11771
11772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011773PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775\n\
11776Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011777If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011778
11779static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011780unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011781{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011782 if (PyTuple_GET_SIZE(args) == 0)
11783 return do_strip(self, RIGHTSTRIP); /* Common case */
11784 else
11785 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011786}
11787
11788
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011790unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011791{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011792 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794
Georg Brandl222de0f2009-04-12 12:01:50 +000011795 if (len < 1) {
11796 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011797 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799
Victor Stinnerc4b49542011-12-11 22:44:26 +010011800 /* no repeat, return original string */
11801 if (len == 1)
11802 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011803
Benjamin Petersonbac79492012-01-14 13:34:47 -050011804 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 return NULL;
11806
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011807 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011808 PyErr_SetString(PyExc_OverflowError,
11809 "repeated string is too long");
11810 return NULL;
11811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011813
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011814 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 if (!u)
11816 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011817 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 if (PyUnicode_GET_LENGTH(str) == 1) {
11820 const int kind = PyUnicode_KIND(str);
11821 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011822 if (kind == PyUnicode_1BYTE_KIND) {
11823 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011824 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011825 }
11826 else if (kind == PyUnicode_2BYTE_KIND) {
11827 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011828 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011829 ucs2[n] = fill_char;
11830 } else {
11831 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11832 assert(kind == PyUnicode_4BYTE_KIND);
11833 for (n = 0; n < len; ++n)
11834 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011836 }
11837 else {
11838 /* number of characters copied this far */
11839 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011840 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 char *to = (char *) PyUnicode_DATA(u);
11842 Py_MEMCPY(to, PyUnicode_DATA(str),
11843 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 n = (done <= nchars-done) ? done : nchars-done;
11846 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011847 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 }
11850
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011851 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011852 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853}
11854
Alexander Belopolsky40018472011-02-26 01:02:56 +000011855PyObject *
11856PyUnicode_Replace(PyObject *obj,
11857 PyObject *subobj,
11858 PyObject *replobj,
11859 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860{
11861 PyObject *self;
11862 PyObject *str1;
11863 PyObject *str2;
11864 PyObject *result;
11865
11866 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011867 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011870 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 Py_DECREF(self);
11872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 }
11874 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011875 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 Py_DECREF(self);
11877 Py_DECREF(str1);
11878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011880 if (PyUnicode_READY(self) == -1 ||
11881 PyUnicode_READY(str1) == -1 ||
11882 PyUnicode_READY(str2) == -1)
11883 result = NULL;
11884 else
11885 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 Py_DECREF(self);
11887 Py_DECREF(str1);
11888 Py_DECREF(str2);
11889 return result;
11890}
11891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011892PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011893 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894\n\
11895Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011896old replaced by new. If the optional argument count is\n\
11897given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898
11899static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011902 PyObject *str1;
11903 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011904 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 PyObject *result;
11906
Martin v. Löwis18e16552006-02-15 17:27:45 +000011907 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011909 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011912 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011913 return NULL;
11914 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011915 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011916 Py_DECREF(str1);
11917 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011918 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011919 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11920 result = NULL;
11921 else
11922 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923
11924 Py_DECREF(str1);
11925 Py_DECREF(str2);
11926 return result;
11927}
11928
Alexander Belopolsky40018472011-02-26 01:02:56 +000011929static PyObject *
11930unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011932 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 Py_ssize_t isize;
11934 Py_ssize_t osize, squote, dquote, i, o;
11935 Py_UCS4 max, quote;
11936 int ikind, okind;
11937 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011940 return NULL;
11941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 isize = PyUnicode_GET_LENGTH(unicode);
11943 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011944
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 /* Compute length of output, quote characters, and
11946 maximum character */
11947 osize = 2; /* quotes */
11948 max = 127;
11949 squote = dquote = 0;
11950 ikind = PyUnicode_KIND(unicode);
11951 for (i = 0; i < isize; i++) {
11952 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11953 switch (ch) {
11954 case '\'': squote++; osize++; break;
11955 case '"': dquote++; osize++; break;
11956 case '\\': case '\t': case '\r': case '\n':
11957 osize += 2; break;
11958 default:
11959 /* Fast-path ASCII */
11960 if (ch < ' ' || ch == 0x7f)
11961 osize += 4; /* \xHH */
11962 else if (ch < 0x7f)
11963 osize++;
11964 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11965 osize++;
11966 max = ch > max ? ch : max;
11967 }
11968 else if (ch < 0x100)
11969 osize += 4; /* \xHH */
11970 else if (ch < 0x10000)
11971 osize += 6; /* \uHHHH */
11972 else
11973 osize += 10; /* \uHHHHHHHH */
11974 }
11975 }
11976
11977 quote = '\'';
11978 if (squote) {
11979 if (dquote)
11980 /* Both squote and dquote present. Use squote,
11981 and escape them */
11982 osize += squote;
11983 else
11984 quote = '"';
11985 }
11986
11987 repr = PyUnicode_New(osize, max);
11988 if (repr == NULL)
11989 return NULL;
11990 okind = PyUnicode_KIND(repr);
11991 odata = PyUnicode_DATA(repr);
11992
11993 PyUnicode_WRITE(okind, odata, 0, quote);
11994 PyUnicode_WRITE(okind, odata, osize-1, quote);
11995
11996 for (i = 0, o = 1; i < isize; i++) {
11997 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011998
11999 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 if ((ch == quote) || (ch == '\\')) {
12001 PyUnicode_WRITE(okind, odata, o++, '\\');
12002 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012003 continue;
12004 }
12005
Benjamin Peterson29060642009-01-31 22:14:21 +000012006 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012007 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 PyUnicode_WRITE(okind, odata, o++, '\\');
12009 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 }
12011 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 PyUnicode_WRITE(okind, odata, o++, '\\');
12013 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012014 }
12015 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 PyUnicode_WRITE(okind, odata, o++, '\\');
12017 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012018 }
12019
12020 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012021 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012022 PyUnicode_WRITE(okind, odata, o++, '\\');
12023 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012024 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12025 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012026 }
12027
Georg Brandl559e5d72008-06-11 18:37:52 +000012028 /* Copy ASCII characters as-is */
12029 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012031 }
12032
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012034 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012035 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012036 (categories Z* and C* except ASCII space)
12037 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012039 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012040 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012042 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012045 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012046 /* Map 16-bit characters to '\uxxxx' */
12047 else if (ch <= 0xffff) {
12048 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012053 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012054 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012055 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012056 PyUnicode_WRITE(okind, odata, o++, 'U');
12057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12058 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12062 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12063 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12064 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012065 }
12066 }
12067 /* Copy characters as-is */
12068 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012070 }
12071 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012074 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012075 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076}
12077
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012078PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012079 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080\n\
12081Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012082such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083arguments start and end are interpreted as in slice notation.\n\
12084\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012085Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
12087static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012090 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012091 Py_ssize_t start;
12092 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012093 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
Jesus Ceaac451502011-04-20 17:09:23 +020012095 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12096 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012097 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 if (PyUnicode_READY(self) == -1)
12100 return NULL;
12101 if (PyUnicode_READY(substring) == -1)
12102 return NULL;
12103
Victor Stinner7931d9a2011-11-04 00:22:48 +010012104 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105
12106 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (result == -2)
12109 return NULL;
12110
Christian Heimes217cfd12007-12-02 14:31:20 +000012111 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112}
12113
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012114PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012117Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
12119static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012121{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012122 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012123 Py_ssize_t start;
12124 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012125 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126
Jesus Ceaac451502011-04-20 17:09:23 +020012127 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12128 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012129 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 if (PyUnicode_READY(self) == -1)
12132 return NULL;
12133 if (PyUnicode_READY(substring) == -1)
12134 return NULL;
12135
Victor Stinner7931d9a2011-11-04 00:22:48 +010012136 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137
12138 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (result == -2)
12141 return NULL;
12142
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143 if (result < 0) {
12144 PyErr_SetString(PyExc_ValueError, "substring not found");
12145 return NULL;
12146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147
Christian Heimes217cfd12007-12-02 14:31:20 +000012148 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149}
12150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012151PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012154Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012155done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
12157static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012158unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012160 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 Py_UCS4 fillchar = ' ';
12162
Victor Stinnere9a29352011-10-01 02:14:59 +020012163 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012165
Benjamin Petersonbac79492012-01-14 13:34:47 -050012166 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167 return NULL;
12168
Victor Stinnerc4b49542011-12-11 22:44:26 +010012169 if (PyUnicode_GET_LENGTH(self) >= width)
12170 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171
Victor Stinnerc4b49542011-12-11 22:44:26 +010012172 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173}
12174
Alexander Belopolsky40018472011-02-26 01:02:56 +000012175PyObject *
12176PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177{
12178 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012179
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180 s = PyUnicode_FromObject(s);
12181 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012182 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012183 if (sep != NULL) {
12184 sep = PyUnicode_FromObject(sep);
12185 if (sep == NULL) {
12186 Py_DECREF(s);
12187 return NULL;
12188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189 }
12190
Victor Stinner9310abb2011-10-05 00:59:23 +020012191 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192
12193 Py_DECREF(s);
12194 Py_XDECREF(sep);
12195 return result;
12196}
12197
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012198PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012199 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200\n\
12201Return a list of the words in S, using sep as the\n\
12202delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012203splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012204whitespace string is a separator and empty strings are\n\
12205removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
12207static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012208unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012210 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012212 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012214 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12215 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216 return NULL;
12217
12218 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012219 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012221 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012223 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224}
12225
Thomas Wouters477c8d52006-05-27 19:21:47 +000012226PyObject *
12227PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12228{
12229 PyObject* str_obj;
12230 PyObject* sep_obj;
12231 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 int kind1, kind2, kind;
12233 void *buf1 = NULL, *buf2 = NULL;
12234 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012235
12236 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012237 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012238 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012239 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012240 if (!sep_obj) {
12241 Py_DECREF(str_obj);
12242 return NULL;
12243 }
12244 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12245 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012246 Py_DECREF(str_obj);
12247 return NULL;
12248 }
12249
Victor Stinner14f8f022011-10-05 20:58:25 +020012250 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012252 kind = Py_MAX(kind1, kind2);
12253 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012255 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012256 if (!buf1)
12257 goto onError;
12258 buf2 = PyUnicode_DATA(sep_obj);
12259 if (kind2 != kind)
12260 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12261 if (!buf2)
12262 goto onError;
12263 len1 = PyUnicode_GET_LENGTH(str_obj);
12264 len2 = PyUnicode_GET_LENGTH(sep_obj);
12265
Benjamin Petersonead6b532011-12-20 17:23:42 -060012266 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012268 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12269 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12270 else
12271 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012272 break;
12273 case PyUnicode_2BYTE_KIND:
12274 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12275 break;
12276 case PyUnicode_4BYTE_KIND:
12277 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12278 break;
12279 default:
12280 assert(0);
12281 out = 0;
12282 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012283
12284 Py_DECREF(sep_obj);
12285 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 if (kind1 != kind)
12287 PyMem_Free(buf1);
12288 if (kind2 != kind)
12289 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012290
12291 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 onError:
12293 Py_DECREF(sep_obj);
12294 Py_DECREF(str_obj);
12295 if (kind1 != kind && buf1)
12296 PyMem_Free(buf1);
12297 if (kind2 != kind && buf2)
12298 PyMem_Free(buf2);
12299 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012300}
12301
12302
12303PyObject *
12304PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12305{
12306 PyObject* str_obj;
12307 PyObject* sep_obj;
12308 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012309 int kind1, kind2, kind;
12310 void *buf1 = NULL, *buf2 = NULL;
12311 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012312
12313 str_obj = PyUnicode_FromObject(str_in);
12314 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012316 sep_obj = PyUnicode_FromObject(sep_in);
12317 if (!sep_obj) {
12318 Py_DECREF(str_obj);
12319 return NULL;
12320 }
12321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 kind1 = PyUnicode_KIND(str_in);
12323 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012324 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 buf1 = PyUnicode_DATA(str_in);
12326 if (kind1 != kind)
12327 buf1 = _PyUnicode_AsKind(str_in, kind);
12328 if (!buf1)
12329 goto onError;
12330 buf2 = PyUnicode_DATA(sep_obj);
12331 if (kind2 != kind)
12332 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12333 if (!buf2)
12334 goto onError;
12335 len1 = PyUnicode_GET_LENGTH(str_obj);
12336 len2 = PyUnicode_GET_LENGTH(sep_obj);
12337
Benjamin Petersonead6b532011-12-20 17:23:42 -060012338 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012340 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12341 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12342 else
12343 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 break;
12345 case PyUnicode_2BYTE_KIND:
12346 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12347 break;
12348 case PyUnicode_4BYTE_KIND:
12349 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12350 break;
12351 default:
12352 assert(0);
12353 out = 0;
12354 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012355
12356 Py_DECREF(sep_obj);
12357 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if (kind1 != kind)
12359 PyMem_Free(buf1);
12360 if (kind2 != kind)
12361 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362
12363 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 onError:
12365 Py_DECREF(sep_obj);
12366 Py_DECREF(str_obj);
12367 if (kind1 != kind && buf1)
12368 PyMem_Free(buf1);
12369 if (kind2 != kind && buf2)
12370 PyMem_Free(buf2);
12371 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372}
12373
12374PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012376\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012377Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012378the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012379found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380
12381static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012382unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383{
Victor Stinner9310abb2011-10-05 00:59:23 +020012384 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012385}
12386
12387PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012388 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012389\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012390Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012391the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012392separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012393
12394static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012395unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396{
Victor Stinner9310abb2011-10-05 00:59:23 +020012397 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012398}
12399
Alexander Belopolsky40018472011-02-26 01:02:56 +000012400PyObject *
12401PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012402{
12403 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012405 s = PyUnicode_FromObject(s);
12406 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012408 if (sep != NULL) {
12409 sep = PyUnicode_FromObject(sep);
12410 if (sep == NULL) {
12411 Py_DECREF(s);
12412 return NULL;
12413 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012414 }
12415
Victor Stinner9310abb2011-10-05 00:59:23 +020012416 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012417
12418 Py_DECREF(s);
12419 Py_XDECREF(sep);
12420 return result;
12421}
12422
12423PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012424 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012425\n\
12426Return a list of the words in S, using sep as the\n\
12427delimiter string, starting at the end of the string and\n\
12428working to the front. If maxsplit is given, at most maxsplit\n\
12429splits are done. If sep is not specified, any whitespace string\n\
12430is a separator.");
12431
12432static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012433unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012434{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012435 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012436 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012437 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012438
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012439 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12440 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012441 return NULL;
12442
12443 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012444 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012445 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012446 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012447 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012448 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012449}
12450
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012451PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012452 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453\n\
12454Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012455Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012456is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457
12458static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012459unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012461 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012462 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012464 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12465 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466 return NULL;
12467
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012468 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469}
12470
12471static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012472PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012474 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475}
12476
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012477PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479\n\
12480Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012481and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482
12483static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012484unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012486 if (PyUnicode_READY(self) == -1)
12487 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012488 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489}
12490
Georg Brandlceee0772007-11-27 23:48:05 +000012491PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012492 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012493\n\
12494Return a translation table usable for str.translate().\n\
12495If there is only one argument, it must be a dictionary mapping Unicode\n\
12496ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012497Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012498If there are two arguments, they must be strings of equal length, and\n\
12499in the resulting dictionary, each character in x will be mapped to the\n\
12500character at the same position in y. If there is a third argument, it\n\
12501must be a string, whose characters will be mapped to None in the result.");
12502
12503static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012504unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012505{
12506 PyObject *x, *y = NULL, *z = NULL;
12507 PyObject *new = NULL, *key, *value;
12508 Py_ssize_t i = 0;
12509 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012510
Georg Brandlceee0772007-11-27 23:48:05 +000012511 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12512 return NULL;
12513 new = PyDict_New();
12514 if (!new)
12515 return NULL;
12516 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012517 int x_kind, y_kind, z_kind;
12518 void *x_data, *y_data, *z_data;
12519
Georg Brandlceee0772007-11-27 23:48:05 +000012520 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012521 if (!PyUnicode_Check(x)) {
12522 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12523 "be a string if there is a second argument");
12524 goto err;
12525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012527 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12528 "arguments must have equal length");
12529 goto err;
12530 }
12531 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012532 x_kind = PyUnicode_KIND(x);
12533 y_kind = PyUnicode_KIND(y);
12534 x_data = PyUnicode_DATA(x);
12535 y_data = PyUnicode_DATA(y);
12536 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12537 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012538 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012539 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012540 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012541 if (!value) {
12542 Py_DECREF(key);
12543 goto err;
12544 }
Georg Brandlceee0772007-11-27 23:48:05 +000012545 res = PyDict_SetItem(new, key, value);
12546 Py_DECREF(key);
12547 Py_DECREF(value);
12548 if (res < 0)
12549 goto err;
12550 }
12551 /* create entries for deleting chars in z */
12552 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 z_kind = PyUnicode_KIND(z);
12554 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012555 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012557 if (!key)
12558 goto err;
12559 res = PyDict_SetItem(new, key, Py_None);
12560 Py_DECREF(key);
12561 if (res < 0)
12562 goto err;
12563 }
12564 }
12565 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 int kind;
12567 void *data;
12568
Georg Brandlceee0772007-11-27 23:48:05 +000012569 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012570 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012571 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12572 "to maketrans it must be a dict");
12573 goto err;
12574 }
12575 /* copy entries into the new dict, converting string keys to int keys */
12576 while (PyDict_Next(x, &i, &key, &value)) {
12577 if (PyUnicode_Check(key)) {
12578 /* convert string keys to integer keys */
12579 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012580 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012581 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12582 "table must be of length 1");
12583 goto err;
12584 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 kind = PyUnicode_KIND(key);
12586 data = PyUnicode_DATA(key);
12587 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012588 if (!newkey)
12589 goto err;
12590 res = PyDict_SetItem(new, newkey, value);
12591 Py_DECREF(newkey);
12592 if (res < 0)
12593 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012594 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012595 /* just keep integer keys */
12596 if (PyDict_SetItem(new, key, value) < 0)
12597 goto err;
12598 } else {
12599 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12600 "be strings or integers");
12601 goto err;
12602 }
12603 }
12604 }
12605 return new;
12606 err:
12607 Py_DECREF(new);
12608 return NULL;
12609}
12610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012611PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012612 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613\n\
12614Return a copy of the string S, where all characters have been mapped\n\
12615through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012616Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012617Unmapped characters are left untouched. Characters mapped to None\n\
12618are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619
12620static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624}
12625
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012626PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012629Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630
12631static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012632unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012634 if (PyUnicode_READY(self) == -1)
12635 return NULL;
12636 if (PyUnicode_IS_ASCII(self))
12637 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012638 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639}
12640
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012641PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012644Pad a numeric string S with zeros on the left, to fill a field\n\
12645of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646
12647static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012648unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012649{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012650 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012651 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012652 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 int kind;
12654 void *data;
12655 Py_UCS4 chr;
12656
Martin v. Löwis18e16552006-02-15 17:27:45 +000012657 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658 return NULL;
12659
Benjamin Petersonbac79492012-01-14 13:34:47 -050012660 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662
Victor Stinnerc4b49542011-12-11 22:44:26 +010012663 if (PyUnicode_GET_LENGTH(self) >= width)
12664 return unicode_result_unchanged(self);
12665
12666 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
12668 u = pad(self, fill, 0, '0');
12669
Walter Dörwald068325e2002-04-15 13:36:47 +000012670 if (u == NULL)
12671 return NULL;
12672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 kind = PyUnicode_KIND(u);
12674 data = PyUnicode_DATA(u);
12675 chr = PyUnicode_READ(kind, data, fill);
12676
12677 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 PyUnicode_WRITE(kind, data, 0, chr);
12680 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012681 }
12682
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012683 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012684 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686
#if 0
/* Disabled: the enclosing "#if 0" keeps this function out of the build.
   If enabled it would return PyUnicode_TransformDecimalAndSpaceToASCII(self)
   unchanged. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
12694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012695PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012696 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012698Return True if S starts with the specified prefix, False otherwise.\n\
12699With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012700With optional end, stop comparing S at that position.\n\
12701prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702
12703static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012704unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012705 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012707 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012708 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012709 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012710 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012711 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012712
Jesus Ceaac451502011-04-20 17:09:23 +020012713 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012715 if (PyTuple_Check(subobj)) {
12716 Py_ssize_t i;
12717 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012718 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012719 if (substring == NULL)
12720 return NULL;
12721 result = tailmatch(self, substring, start, end, -1);
12722 Py_DECREF(substring);
12723 if (result) {
12724 Py_RETURN_TRUE;
12725 }
12726 }
12727 /* nothing matched */
12728 Py_RETURN_FALSE;
12729 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012730 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012731 if (substring == NULL) {
12732 if (PyErr_ExceptionMatches(PyExc_TypeError))
12733 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12734 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012735 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012736 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012737 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012738 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012739 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740}
12741
12742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012743PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012746Return True if S ends with the specified suffix, False otherwise.\n\
12747With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012748With optional end, stop comparing S at that position.\n\
12749suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750
12751static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012752unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012754{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012755 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012756 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012757 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012758 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012759 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760
Jesus Ceaac451502011-04-20 17:09:23 +020012761 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012762 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012763 if (PyTuple_Check(subobj)) {
12764 Py_ssize_t i;
12765 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012766 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012768 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012769 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012770 result = tailmatch(self, substring, start, end, +1);
12771 Py_DECREF(substring);
12772 if (result) {
12773 Py_RETURN_TRUE;
12774 }
12775 }
12776 Py_RETURN_FALSE;
12777 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012778 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012779 if (substring == NULL) {
12780 if (PyErr_ExceptionMatches(PyExc_TypeError))
12781 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12782 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012784 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012785 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012787 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788}
12789
Victor Stinner202fdca2012-05-07 12:47:02 +020012790Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012791_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012792{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012793 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012794 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12795 writer->data = PyUnicode_DATA(writer->buffer);
12796 writer->kind = PyUnicode_KIND(writer->buffer);
12797}
12798
Victor Stinnerd3f08822012-05-29 12:57:52 +020012799void
12800_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012801{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012802 memset(writer, 0, sizeof(*writer));
12803#ifdef Py_DEBUG
12804 writer->kind = 5; /* invalid kind */
12805#endif
12806 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012807 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012808}
12809
Victor Stinnerd3f08822012-05-29 12:57:52 +020012810int
12811_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12812 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012813{
12814 Py_ssize_t newlen;
12815 PyObject *newbuffer;
12816
Victor Stinnerd3f08822012-05-29 12:57:52 +020012817 assert(length > 0);
12818
Victor Stinner202fdca2012-05-07 12:47:02 +020012819 if (length > PY_SSIZE_T_MAX - writer->pos) {
12820 PyErr_NoMemory();
12821 return -1;
12822 }
12823 newlen = writer->pos + length;
12824
Victor Stinnerd3f08822012-05-29 12:57:52 +020012825 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012826 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012827 /* overallocate 25% to limit the number of resize */
12828 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12829 newlen += newlen / 4;
12830 if (newlen < writer->min_length)
12831 newlen = writer->min_length;
12832 }
12833 writer->buffer = PyUnicode_New(newlen, maxchar);
12834 if (writer->buffer == NULL)
12835 return -1;
12836 _PyUnicodeWriter_Update(writer);
12837 return 0;
12838 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012839
Victor Stinnerd3f08822012-05-29 12:57:52 +020012840 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012841 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012842 /* overallocate 25% to limit the number of resize */
12843 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12844 newlen += newlen / 4;
12845 if (newlen < writer->min_length)
12846 newlen = writer->min_length;
12847 }
12848
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012849 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012850 /* resize + widen */
12851 newbuffer = PyUnicode_New(newlen, maxchar);
12852 if (newbuffer == NULL)
12853 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012854 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12855 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012856 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012857 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012858 }
12859 else {
12860 newbuffer = resize_compact(writer->buffer, newlen);
12861 if (newbuffer == NULL)
12862 return -1;
12863 }
12864 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012865 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012866 }
12867 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012868 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012869 newbuffer = PyUnicode_New(writer->size, maxchar);
12870 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012871 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012872 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12873 writer->buffer, 0, writer->pos);
12874 Py_DECREF(writer->buffer);
12875 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012876 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012877 }
12878 return 0;
12879}
12880
Victor Stinnerd3f08822012-05-29 12:57:52 +020012881int
12882_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12883{
12884 Py_UCS4 maxchar;
12885 Py_ssize_t len;
12886
12887 if (PyUnicode_READY(str) == -1)
12888 return -1;
12889 len = PyUnicode_GET_LENGTH(str);
12890 if (len == 0)
12891 return 0;
12892 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12893 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012894 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012895 Py_INCREF(str);
12896 writer->buffer = str;
12897 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012898 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012899 writer->size = 0;
12900 writer->pos += len;
12901 return 0;
12902 }
12903 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12904 return -1;
12905 }
12906 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12907 str, 0, len);
12908 writer->pos += len;
12909 return 0;
12910}
12911
12912PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012913_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012914{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012915 if (writer->pos == 0) {
12916 Py_XDECREF(writer->buffer);
12917 Py_INCREF(unicode_empty);
12918 return unicode_empty;
12919 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012920 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012921 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12922 return writer->buffer;
12923 }
12924 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12925 PyObject *newbuffer;
12926 newbuffer = resize_compact(writer->buffer, writer->pos);
12927 if (newbuffer == NULL) {
12928 Py_DECREF(writer->buffer);
12929 return NULL;
12930 }
12931 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012932 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012933 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012934 return writer->buffer;
12935}
12936
Victor Stinnerd3f08822012-05-29 12:57:52 +020012937void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012938_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012939{
12940 Py_CLEAR(writer->buffer);
12941}
12942
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012944
/* Docstrings for S.format() and S.format_map().
   NOTE(review): the functions they describe are not defined in this part of
   the file; presumably they come from the stringlib/unicode_format.h include
   just above -- confirm. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012956
Eric Smith4a7d76d2008-05-30 18:10:19 +000012957static PyObject *
12958unicode__format__(PyObject* self, PyObject* args)
12959{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012960 PyObject *format_spec;
12961 _PyUnicodeWriter writer;
12962 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012963
12964 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12965 return NULL;
12966
Victor Stinnerd3f08822012-05-29 12:57:52 +020012967 if (PyUnicode_READY(self) == -1)
12968 return NULL;
12969 _PyUnicodeWriter_Init(&writer, 0);
12970 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12971 self, format_spec, 0,
12972 PyUnicode_GET_LENGTH(format_spec));
12973 if (ret == -1) {
12974 _PyUnicodeWriter_Dealloc(&writer);
12975 return NULL;
12976 }
12977 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012978}
12979
Eric Smith8c663262007-08-25 02:26:07 +000012980PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012982\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012983Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012984
12985static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012986unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012987{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 Py_ssize_t size;
12989
12990 /* If it's a compact object, account for base structure +
12991 character data. */
12992 if (PyUnicode_IS_COMPACT_ASCII(v))
12993 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12994 else if (PyUnicode_IS_COMPACT(v))
12995 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012996 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 else {
12998 /* If it is a two-block object, account for base object, and
12999 for character block if present. */
13000 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013001 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013002 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013003 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 }
13005 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013006 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013007 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013009 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013010 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011
13012 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013013}
13014
13015PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013017
13018static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013019unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013020{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013021 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 if (!copy)
13023 return NULL;
13024 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013025}
13026
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013028 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013029 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013030 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13031 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013032 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13033 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013034 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013035 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13036 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13037 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13038 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13039 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013040 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013041 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13042 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13043 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013044 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013045 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13046 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13047 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013048 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013049 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013050 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013051 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013052 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13053 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13054 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13055 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13056 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13057 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13058 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13059 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13060 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13061 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13062 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13063 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13064 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13065 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013066 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013067 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013068 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013069 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013070 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013071 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013072 {"maketrans", (PyCFunction) unicode_maketrans,
13073 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013074 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013075#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013076 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013077 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078#endif
13079
Benjamin Peterson14339b62009-01-31 16:36:08 +000013080 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081 {NULL, NULL}
13082};
13083
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013084static PyObject *
13085unicode_mod(PyObject *v, PyObject *w)
13086{
Brian Curtindfc80e32011-08-10 20:28:54 -050013087 if (!PyUnicode_Check(v))
13088 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013089 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013090}
13091
13092static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 0, /*nb_add*/
13094 0, /*nb_subtract*/
13095 0, /*nb_multiply*/
13096 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013097};
13098
Guido van Rossumd57fd912000-03-10 22:53:23 +000013099static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013100 (lenfunc) unicode_length, /* sq_length */
13101 PyUnicode_Concat, /* sq_concat */
13102 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13103 (ssizeargfunc) unicode_getitem, /* sq_item */
13104 0, /* sq_slice */
13105 0, /* sq_ass_item */
13106 0, /* sq_ass_slice */
13107 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108};
13109
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013110static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013111unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013113 if (PyUnicode_READY(self) == -1)
13114 return NULL;
13115
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013116 if (PyIndex_Check(item)) {
13117 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013118 if (i == -1 && PyErr_Occurred())
13119 return NULL;
13120 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013122 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013123 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013124 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013125 PyObject *result;
13126 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013127 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013128 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013132 return NULL;
13133 }
13134
13135 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010013136 Py_INCREF(unicode_empty);
13137 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013139 slicelength == PyUnicode_GET_LENGTH(self)) {
13140 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013141 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013142 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013143 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013144 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013145 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013146 src_kind = PyUnicode_KIND(self);
13147 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013148 if (!PyUnicode_IS_ASCII(self)) {
13149 kind_limit = kind_maxchar_limit(src_kind);
13150 max_char = 0;
13151 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13152 ch = PyUnicode_READ(src_kind, src_data, cur);
13153 if (ch > max_char) {
13154 max_char = ch;
13155 if (max_char >= kind_limit)
13156 break;
13157 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013158 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013159 }
Victor Stinner55c99112011-10-13 01:17:06 +020013160 else
13161 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013162 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013163 if (result == NULL)
13164 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013165 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013166 dest_data = PyUnicode_DATA(result);
13167
13168 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013169 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13170 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013171 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013172 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013173 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013174 } else {
13175 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13176 return NULL;
13177 }
13178}
13179
13180static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013181 (lenfunc)unicode_length, /* mp_length */
13182 (binaryfunc)unicode_subscript, /* mp_subscript */
13183 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013184};
13185
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187/* Helpers for PyUnicode_Format() */
13188
13189static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013190getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013192 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 (*p_argidx)++;
13195 if (arglen < 0)
13196 return args;
13197 else
13198 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199 }
13200 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013201 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202 return NULL;
13203}
13204
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013205/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013206
Victor Stinnerd3f08822012-05-29 12:57:52 +020013207static int
13208formatfloat(PyObject *v, int flags, int prec, int type,
13209 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013211 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013213 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013214
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215 x = PyFloat_AsDouble(v);
13216 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013217 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013218
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013221
Eric Smith0923d1d2009-04-16 20:16:10 +000013222 p = PyOS_double_to_string(x, type, prec,
13223 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013224 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013225 return -1;
13226 len = strlen(p);
13227 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013228 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13229 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013230 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013231 }
Victor Stinner184252a2012-06-16 02:57:41 +020013232 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013233 writer->pos += len;
13234 }
13235 else
13236 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013237 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013238 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239}
13240
Victor Stinnerd0880d52012-04-27 23:40:13 +020013241/* formatlong() emulates the format codes d, u, o, x and X, and
13242 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13243 * Python's regular ints.
13244 * Return value: a new PyUnicodeObject*, or NULL if error.
13245 * The output string is of the form
13246 * "-"? ("0x" | "0X")? digit+
13247 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13248 * set in flags. The case of hex digits will be correct,
13249 * There will be at least prec digits, zero-filled on the left if
13250 * necessary to get that many.
13251 * val object to be converted
13252 * flags bitmask of format flags; only F_ALT is looked at
13253 * prec minimum number of digits; 0-fill on left if needed
13254 * type a character in [duoxX]; u acts the same as d
13255 *
13256 * CAUTION: o, x and X conversions on regular ints can never
13257 * produce a '-' sign, but can for Python's unbounded ints.
13258 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013259static PyObject*
13260formatlong(PyObject *val, int flags, int prec, int type)
13261{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013262 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013264 Py_ssize_t i;
13265 int sign; /* 1 if '-', else 0 */
13266 int len; /* number of characters */
13267 Py_ssize_t llen;
13268 int numdigits; /* len == numnondigits + numdigits */
13269 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013270
Victor Stinnerd0880d52012-04-27 23:40:13 +020013271 /* Avoid exceeding SSIZE_T_MAX */
13272 if (prec > INT_MAX-3) {
13273 PyErr_SetString(PyExc_OverflowError,
13274 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013276 }
13277
13278 assert(PyLong_Check(val));
13279
13280 switch (type) {
13281 case 'd':
13282 case 'u':
13283 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013284 if (PyBool_Check(val))
13285 result = PyNumber_ToBase(val, 10);
13286 else
13287 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013288 break;
13289 case 'o':
13290 numnondigits = 2;
13291 result = PyNumber_ToBase(val, 8);
13292 break;
13293 case 'x':
13294 case 'X':
13295 numnondigits = 2;
13296 result = PyNumber_ToBase(val, 16);
13297 break;
13298 default:
13299 assert(!"'type' not in [duoxX]");
13300 }
13301 if (!result)
13302 return NULL;
13303
13304 assert(unicode_modifiable(result));
13305 assert(PyUnicode_IS_READY(result));
13306 assert(PyUnicode_IS_ASCII(result));
13307
13308 /* To modify the string in-place, there can only be one reference. */
13309 if (Py_REFCNT(result) != 1) {
13310 PyErr_BadInternalCall();
13311 return NULL;
13312 }
13313 buf = PyUnicode_DATA(result);
13314 llen = PyUnicode_GET_LENGTH(result);
13315 if (llen > INT_MAX) {
13316 PyErr_SetString(PyExc_ValueError,
13317 "string too large in _PyBytes_FormatLong");
13318 return NULL;
13319 }
13320 len = (int)llen;
13321 sign = buf[0] == '-';
13322 numnondigits += sign;
13323 numdigits = len - numnondigits;
13324 assert(numdigits > 0);
13325
13326 /* Get rid of base marker unless F_ALT */
13327 if (((flags & F_ALT) == 0 &&
13328 (type == 'o' || type == 'x' || type == 'X'))) {
13329 assert(buf[sign] == '0');
13330 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13331 buf[sign+1] == 'o');
13332 numnondigits -= 2;
13333 buf += 2;
13334 len -= 2;
13335 if (sign)
13336 buf[0] = '-';
13337 assert(len == numnondigits + numdigits);
13338 assert(numdigits > 0);
13339 }
13340
13341 /* Fill with leading zeroes to meet minimum width. */
13342 if (prec > numdigits) {
13343 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13344 numnondigits + prec);
13345 char *b1;
13346 if (!r1) {
13347 Py_DECREF(result);
13348 return NULL;
13349 }
13350 b1 = PyBytes_AS_STRING(r1);
13351 for (i = 0; i < numnondigits; ++i)
13352 *b1++ = *buf++;
13353 for (i = 0; i < prec - numdigits; i++)
13354 *b1++ = '0';
13355 for (i = 0; i < numdigits; i++)
13356 *b1++ = *buf++;
13357 *b1 = '\0';
13358 Py_DECREF(result);
13359 result = r1;
13360 buf = PyBytes_AS_STRING(result);
13361 len = numnondigits + prec;
13362 }
13363
13364 /* Fix up case for hex conversions. */
13365 if (type == 'X') {
13366 /* Need to convert all lower case letters to upper case.
13367 and need to convert 0x to 0X (and -0x to -0X). */
13368 for (i = 0; i < len; i++)
13369 if (buf[i] >= 'a' && buf[i] <= 'x')
13370 buf[i] -= 'a'-'A';
13371 }
13372 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13373 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013374 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013375 Py_DECREF(result);
13376 result = unicode;
13377 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013378 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013379}
13380
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013381static Py_UCS4
13382formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013384 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013385 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013386 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013387 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 goto onError;
13390 }
13391 else {
13392 /* Integer input truncated to a character */
13393 long x;
13394 x = PyLong_AsLong(v);
13395 if (x == -1 && PyErr_Occurred())
13396 goto onError;
13397
Victor Stinner8faf8212011-12-08 22:14:11 +010013398 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 PyErr_SetString(PyExc_OverflowError,
13400 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013401 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 }
13403
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013404 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013405 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013406
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013408 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013409 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013410 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013411}
13412
Alexander Belopolsky40018472011-02-26 01:02:56 +000013413PyObject *
13414PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013415{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013416 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013418 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013419 PyObject *temp = NULL;
13420 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013421 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013422 void *fmt;
13423 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013424 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013425 Py_ssize_t sublen;
13426 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013427
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 PyErr_BadInternalCall();
13430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013432 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013433 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013435 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013436 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013437 return NULL;
13438 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013439
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 fmt = PyUnicode_DATA(uformat);
13441 fmtkind = PyUnicode_KIND(uformat);
13442 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13443 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013444
Victor Stinnerd3f08822012-05-29 12:57:52 +020013445 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013446
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 arglen = PyTuple_Size(args);
13449 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450 }
13451 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013452 arglen = -1;
13453 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013455 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457
13458 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013459 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013460 Py_ssize_t nonfmtpos;
13461 nonfmtpos = fmtpos++;
13462 while (fmtcnt >= 0 &&
13463 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13464 fmtpos++;
13465 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013467 if (fmtcnt < 0)
13468 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013469 sublen = fmtpos - nonfmtpos;
13470 maxchar = _PyUnicode_FindMaxChar(uformat,
13471 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013472 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013473 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013474
Victor Stinnerd3f08822012-05-29 12:57:52 +020013475 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13476 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013477 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013478 }
13479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 /* Got a format specifier */
13481 int flags = 0;
13482 Py_ssize_t width = -1;
13483 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013484 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013485 Py_UCS4 fill;
13486 int sign;
13487 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013488 int isnumok;
13489 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013490 void *pbuf = NULL;
13491 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013492 Py_UCS4 bufmaxchar;
13493 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013495 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013496 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13497 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 Py_ssize_t keylen;
13500 PyObject *key;
13501 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013502
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 if (dict == NULL) {
13504 PyErr_SetString(PyExc_TypeError,
13505 "format requires a mapping");
13506 goto onError;
13507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013510 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 /* Skip over balanced parentheses */
13512 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013513 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13514 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013515 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013516 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 if (fmtcnt < 0 || pcount > 0) {
13522 PyErr_SetString(PyExc_ValueError,
13523 "incomplete format key");
13524 goto onError;
13525 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013526 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013527 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 if (key == NULL)
13529 goto onError;
13530 if (args_owned) {
13531 Py_DECREF(args);
13532 args_owned = 0;
13533 }
13534 args = PyObject_GetItem(dict, key);
13535 Py_DECREF(key);
13536 if (args == NULL) {
13537 goto onError;
13538 }
13539 args_owned = 1;
13540 arglen = -1;
13541 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013544 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13545 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 case '-': flags |= F_LJUST; continue;
13547 case '+': flags |= F_SIGN; continue;
13548 case ' ': flags |= F_BLANK; continue;
13549 case '#': flags |= F_ALT; continue;
13550 case '0': flags |= F_ZERO; continue;
13551 }
13552 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013553 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 if (c == '*') {
13555 v = getnextarg(args, arglen, &argidx);
13556 if (v == NULL)
13557 goto onError;
13558 if (!PyLong_Check(v)) {
13559 PyErr_SetString(PyExc_TypeError,
13560 "* wants int");
13561 goto onError;
13562 }
13563 width = PyLong_AsLong(v);
13564 if (width == -1 && PyErr_Occurred())
13565 goto onError;
13566 if (width < 0) {
13567 flags |= F_LJUST;
13568 width = -width;
13569 }
13570 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013571 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 }
13573 else if (c >= '0' && c <= '9') {
13574 width = c - '0';
13575 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 if (c < '0' || c > '9')
13578 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013579 /* Since c is unsigned, the RHS would end up as unsigned,
13580 mixing signed and unsigned comparison. Since c is between
13581 '0' and '9', casting to int is safe. */
13582 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013583 PyErr_SetString(PyExc_ValueError,
13584 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013585 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 }
13587 width = width*10 + (c - '0');
13588 }
13589 }
13590 if (c == '.') {
13591 prec = 0;
13592 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013593 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 if (c == '*') {
13595 v = getnextarg(args, arglen, &argidx);
13596 if (v == NULL)
13597 goto onError;
13598 if (!PyLong_Check(v)) {
13599 PyErr_SetString(PyExc_TypeError,
13600 "* wants int");
13601 goto onError;
13602 }
13603 prec = PyLong_AsLong(v);
13604 if (prec == -1 && PyErr_Occurred())
13605 goto onError;
13606 if (prec < 0)
13607 prec = 0;
13608 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013609 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 }
13611 else if (c >= '0' && c <= '9') {
13612 prec = c - '0';
13613 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 if (c < '0' || c > '9')
13616 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013617 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 PyErr_SetString(PyExc_ValueError,
13619 "prec too big");
13620 goto onError;
13621 }
13622 prec = prec*10 + (c - '0');
13623 }
13624 }
13625 } /* prec */
13626 if (fmtcnt >= 0) {
13627 if (c == 'h' || c == 'l' || c == 'L') {
13628 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013629 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013630 }
13631 }
13632 if (fmtcnt < 0) {
13633 PyErr_SetString(PyExc_ValueError,
13634 "incomplete format");
13635 goto onError;
13636 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013637 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013638 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013639
13640 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013641 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013642 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013643 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13644 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013645 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013647
Victor Stinneraff3cc62012-04-30 05:19:21 +020013648 v = getnextarg(args, arglen, &argidx);
13649 if (v == NULL)
13650 goto onError;
13651
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013653 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 fill = ' ';
13655 switch (c) {
13656
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 case 's':
13658 case 'r':
13659 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013660 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13661 /* Fast path */
13662 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13663 goto onError;
13664 goto nextarg;
13665 }
13666
Victor Stinner808fc0a2010-03-22 12:50:40 +000013667 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013668 temp = v;
13669 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013670 }
13671 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013672 if (c == 's')
13673 temp = PyObject_Str(v);
13674 else if (c == 'r')
13675 temp = PyObject_Repr(v);
13676 else
13677 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013678 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013679 break;
13680
13681 case 'i':
13682 case 'd':
13683 case 'u':
13684 case 'o':
13685 case 'x':
13686 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013687 if (PyLong_CheckExact(v)
13688 && width == -1 && prec == -1
13689 && !(flags & (F_SIGN | F_BLANK)))
13690 {
13691 /* Fast path */
13692 switch(c)
13693 {
13694 case 'd':
13695 case 'i':
13696 case 'u':
13697 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13698 goto onError;
13699 goto nextarg;
13700 case 'x':
13701 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13702 goto onError;
13703 goto nextarg;
13704 case 'o':
13705 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13706 goto onError;
13707 goto nextarg;
13708 default:
13709 break;
13710 }
13711 }
13712
Benjamin Peterson29060642009-01-31 22:14:21 +000013713 isnumok = 0;
13714 if (PyNumber_Check(v)) {
13715 PyObject *iobj=NULL;
13716
13717 if (PyLong_Check(v)) {
13718 iobj = v;
13719 Py_INCREF(iobj);
13720 }
13721 else {
13722 iobj = PyNumber_Long(v);
13723 }
13724 if (iobj!=NULL) {
13725 if (PyLong_Check(iobj)) {
13726 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013727 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013728 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013730 }
13731 else {
13732 Py_DECREF(iobj);
13733 }
13734 }
13735 }
13736 if (!isnumok) {
13737 PyErr_Format(PyExc_TypeError,
13738 "%%%c format: a number is required, "
13739 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13740 goto onError;
13741 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013742 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013743 fill = '0';
13744 break;
13745
13746 case 'e':
13747 case 'E':
13748 case 'f':
13749 case 'F':
13750 case 'g':
13751 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013752 if (width == -1 && prec == -1
13753 && !(flags & (F_SIGN | F_BLANK)))
13754 {
13755 /* Fast path */
13756 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13757 goto onError;
13758 goto nextarg;
13759 }
13760
Benjamin Peterson29060642009-01-31 22:14:21 +000013761 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013762 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013763 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013764 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13765 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013766 break;
13767
13768 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013769 {
13770 Py_UCS4 ch = formatchar(v);
13771 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013772 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013773 if (width == -1 && prec == -1) {
13774 /* Fast path */
13775 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13776 goto onError;
13777 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13778 writer.pos += 1;
13779 goto nextarg;
13780 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013781 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013782 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013783 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013784
13785 default:
13786 PyErr_Format(PyExc_ValueError,
13787 "unsupported format character '%c' (0x%x) "
13788 "at index %zd",
13789 (31<=c && c<=126) ? (char)c : '?',
13790 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013792 goto onError;
13793 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013794 if (temp == NULL)
13795 goto onError;
13796 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013797
13798 if (width == -1 && prec == -1
13799 && !(flags & (F_SIGN | F_BLANK)))
13800 {
13801 /* Fast path */
13802 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13803 goto onError;
13804 goto nextarg;
13805 }
13806
Victor Stinneraff3cc62012-04-30 05:19:21 +020013807 if (PyUnicode_READY(temp) == -1) {
13808 Py_CLEAR(temp);
13809 goto onError;
13810 }
13811 kind = PyUnicode_KIND(temp);
13812 pbuf = PyUnicode_DATA(temp);
13813 len = PyUnicode_GET_LENGTH(temp);
13814
13815 if (c == 's' || c == 'r' || c == 'a') {
13816 if (prec >= 0 && len > prec)
13817 len = prec;
13818 }
13819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013820 /* pbuf is initialized here. */
13821 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013822 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013823 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13824 if (ch == '-' || ch == '+') {
13825 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013826 len--;
13827 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 }
13829 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013830 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013831 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013832 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013833 else
13834 sign = 0;
13835 }
13836 if (width < len)
13837 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013838
13839 /* Compute the length and maximum character of the
13840 written characters */
13841 bufmaxchar = 127;
13842 if (!(flags & F_LJUST)) {
13843 if (sign) {
13844 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013846 }
13847 else {
13848 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013850 }
13851 }
13852 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013853 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013854
13855 buflen = width;
13856 if (sign && len == width)
13857 buflen++;
13858
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013859 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013860 goto onError;
13861
13862 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013863 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013864 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013865 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13866 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013867 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 if (width > len)
13869 width--;
13870 }
13871 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013872 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013873 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013875 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13876 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13877 writer.pos += 2;
13878 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013879 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013880 width -= 2;
13881 if (width < 0)
13882 width = 0;
13883 len -= 2;
13884 }
13885 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013886 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013887 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13888 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013889 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013890 }
13891 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013892 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013893 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13894 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013895 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013896 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013897 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13898 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013899 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13900 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13901 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013902 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013903 }
13904 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013905
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013906 if (len) {
13907 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13908 temp, pindex, len);
13909 writer.pos += len;
13910 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013911 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013912 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013913 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13914 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013915 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013916
Victor Stinnerd3f08822012-05-29 12:57:52 +020013917nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013918 if (dict && (argidx < arglen) && c != '%') {
13919 PyErr_SetString(PyExc_TypeError,
13920 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 goto onError;
13922 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013923 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013924 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925 } /* until end */
13926 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013927 PyErr_SetString(PyExc_TypeError,
13928 "not all arguments converted during string formatting");
13929 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013930 }
13931
13932 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013933 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013934 }
13935 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013936 Py_XDECREF(temp);
13937 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013938 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939
Benjamin Peterson29060642009-01-31 22:14:21 +000013940 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013941 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013942 Py_XDECREF(temp);
13943 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013944 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013946 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013947 }
13948 return NULL;
13949}
13950
Jeremy Hylton938ace62002-07-17 16:30:39 +000013951static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013952unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13953
Tim Peters6d6c1a32001-08-02 04:15:00 +000013954static PyObject *
13955unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13956{
Benjamin Peterson29060642009-01-31 22:14:21 +000013957 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 static char *kwlist[] = {"object", "encoding", "errors", 0};
13959 char *encoding = NULL;
13960 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013961
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 if (type != &PyUnicode_Type)
13963 return unicode_subtype_new(type, args, kwds);
13964 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013965 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013966 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013967 if (x == NULL) {
13968 Py_INCREF(unicode_empty);
13969 return unicode_empty;
13970 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 if (encoding == NULL && errors == NULL)
13972 return PyObject_Str(x);
13973 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013975}
13976
Guido van Rossume023fe02001-08-30 03:12:59 +000013977static PyObject *
13978unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13979{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013980 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013981 Py_ssize_t length, char_size;
13982 int share_wstr, share_utf8;
13983 unsigned int kind;
13984 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013985
Benjamin Peterson14339b62009-01-31 16:36:08 +000013986 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013987
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013988 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013989 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013991 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013992 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013993 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013994 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013995 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013996
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013997 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013998 if (self == NULL) {
13999 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014000 return NULL;
14001 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014002 kind = PyUnicode_KIND(unicode);
14003 length = PyUnicode_GET_LENGTH(unicode);
14004
14005 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014006#ifdef Py_DEBUG
14007 _PyUnicode_HASH(self) = -1;
14008#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014009 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014010#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014011 _PyUnicode_STATE(self).interned = 0;
14012 _PyUnicode_STATE(self).kind = kind;
14013 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014014 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014015 _PyUnicode_STATE(self).ready = 1;
14016 _PyUnicode_WSTR(self) = NULL;
14017 _PyUnicode_UTF8_LENGTH(self) = 0;
14018 _PyUnicode_UTF8(self) = NULL;
14019 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014020 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014021
14022 share_utf8 = 0;
14023 share_wstr = 0;
14024 if (kind == PyUnicode_1BYTE_KIND) {
14025 char_size = 1;
14026 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14027 share_utf8 = 1;
14028 }
14029 else if (kind == PyUnicode_2BYTE_KIND) {
14030 char_size = 2;
14031 if (sizeof(wchar_t) == 2)
14032 share_wstr = 1;
14033 }
14034 else {
14035 assert(kind == PyUnicode_4BYTE_KIND);
14036 char_size = 4;
14037 if (sizeof(wchar_t) == 4)
14038 share_wstr = 1;
14039 }
14040
14041 /* Ensure we won't overflow the length. */
14042 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14043 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014044 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014045 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014046 data = PyObject_MALLOC((length + 1) * char_size);
14047 if (data == NULL) {
14048 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014049 goto onError;
14050 }
14051
Victor Stinnerc3c74152011-10-02 20:39:55 +020014052 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014053 if (share_utf8) {
14054 _PyUnicode_UTF8_LENGTH(self) = length;
14055 _PyUnicode_UTF8(self) = data;
14056 }
14057 if (share_wstr) {
14058 _PyUnicode_WSTR_LENGTH(self) = length;
14059 _PyUnicode_WSTR(self) = (wchar_t *)data;
14060 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014061
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014062 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014063 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014064 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014065#ifdef Py_DEBUG
14066 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14067#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014068 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014069 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014070
14071onError:
14072 Py_DECREF(unicode);
14073 Py_DECREF(self);
14074 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014075}
14076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014077PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014078"str(object='') -> str\n\
14079str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014080\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014081Create a new string object from the given object. If encoding or\n\
14082errors is specified, then the object must expose a data buffer\n\
14083that will be decoded using the given encoding and error handler.\n\
14084Otherwise, returns the result of object.__str__() (if defined)\n\
14085or repr(object).\n\
14086encoding defaults to sys.getdefaultencoding().\n\
14087errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014088
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014089static PyObject *unicode_iter(PyObject *seq);
14090
Guido van Rossumd57fd912000-03-10 22:53:23 +000014091PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014092 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014093 "str", /* tp_name */
14094 sizeof(PyUnicodeObject), /* tp_size */
14095 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014096 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014097 (destructor)unicode_dealloc, /* tp_dealloc */
14098 0, /* tp_print */
14099 0, /* tp_getattr */
14100 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014101 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014102 unicode_repr, /* tp_repr */
14103 &unicode_as_number, /* tp_as_number */
14104 &unicode_as_sequence, /* tp_as_sequence */
14105 &unicode_as_mapping, /* tp_as_mapping */
14106 (hashfunc) unicode_hash, /* tp_hash*/
14107 0, /* tp_call*/
14108 (reprfunc) unicode_str, /* tp_str */
14109 PyObject_GenericGetAttr, /* tp_getattro */
14110 0, /* tp_setattro */
14111 0, /* tp_as_buffer */
14112 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014113 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 unicode_doc, /* tp_doc */
14115 0, /* tp_traverse */
14116 0, /* tp_clear */
14117 PyUnicode_RichCompare, /* tp_richcompare */
14118 0, /* tp_weaklistoffset */
14119 unicode_iter, /* tp_iter */
14120 0, /* tp_iternext */
14121 unicode_methods, /* tp_methods */
14122 0, /* tp_members */
14123 0, /* tp_getset */
14124 &PyBaseObject_Type, /* tp_base */
14125 0, /* tp_dict */
14126 0, /* tp_descr_get */
14127 0, /* tp_descr_set */
14128 0, /* tp_dictoffset */
14129 0, /* tp_init */
14130 0, /* tp_alloc */
14131 unicode_new, /* tp_new */
14132 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014133};
14134
14135/* Initialize the Unicode implementation */
14136
Victor Stinner3a50e702011-10-18 21:21:00 +020014137int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014138{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014139 int i;
14140
Thomas Wouters477c8d52006-05-27 19:21:47 +000014141 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014142 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014143 0x000A, /* LINE FEED */
14144 0x000D, /* CARRIAGE RETURN */
14145 0x001C, /* FILE SEPARATOR */
14146 0x001D, /* GROUP SEPARATOR */
14147 0x001E, /* RECORD SEPARATOR */
14148 0x0085, /* NEXT LINE */
14149 0x2028, /* LINE SEPARATOR */
14150 0x2029, /* PARAGRAPH SEPARATOR */
14151 };
14152
Fred Drakee4315f52000-05-09 19:53:39 +000014153 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020014154 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014155 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014156 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010014157 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014158
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014159 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000014160 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000014161 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014162 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014163
14164 /* initialize the linebreak bloom filter */
14165 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014166 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014167 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014168
14169 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014170
Benjamin Petersonc4311282012-10-30 23:21:10 -040014171 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14172 Py_FatalError("Can't initialize field name iterator type");
14173
14174 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14175 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014176
Victor Stinner3a50e702011-10-18 21:21:00 +020014177#ifdef HAVE_MBCS
14178 winver.dwOSVersionInfoSize = sizeof(winver);
14179 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14180 PyErr_SetFromWindowsErr(0);
14181 return -1;
14182 }
14183#endif
14184 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185}
14186
14187/* Finalize the Unicode implementation */
14188
/* Nothing is released here: the function does no work and always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14194
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195void
Thomas Wouters78890102000-07-22 19:25:51 +000014196_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014198 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014199
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000014200 Py_XDECREF(unicode_empty);
14201 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014202
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014203 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014204 if (unicode_latin1[i]) {
14205 Py_DECREF(unicode_latin1[i]);
14206 unicode_latin1[i] = NULL;
14207 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014208 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014209 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014210 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014211}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014212
Walter Dörwald16807132007-05-25 13:52:07 +000014213void
14214PyUnicode_InternInPlace(PyObject **p)
14215{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014216 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014218#ifdef Py_DEBUG
14219 assert(s != NULL);
14220 assert(_PyUnicode_CHECK(s));
14221#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014222 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014223 return;
14224#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014225 /* If it's a subclass, we don't really know what putting
14226 it in the interned dict might do. */
14227 if (!PyUnicode_CheckExact(s))
14228 return;
14229 if (PyUnicode_CHECK_INTERNED(s))
14230 return;
14231 if (interned == NULL) {
14232 interned = PyDict_New();
14233 if (interned == NULL) {
14234 PyErr_Clear(); /* Don't leave an exception */
14235 return;
14236 }
14237 }
14238 /* It might be that the GetItem call fails even
14239 though the key is present in the dictionary,
14240 namely when this happens during a stack overflow. */
14241 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014242 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014243 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014244
Benjamin Peterson29060642009-01-31 22:14:21 +000014245 if (t) {
14246 Py_INCREF(t);
14247 Py_DECREF(*p);
14248 *p = t;
14249 return;
14250 }
Walter Dörwald16807132007-05-25 13:52:07 +000014251
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014253 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014254 PyErr_Clear();
14255 PyThreadState_GET()->recursion_critical = 0;
14256 return;
14257 }
14258 PyThreadState_GET()->recursion_critical = 0;
14259 /* The two references in interned are not counted by refcnt.
14260 The deallocator will take care of this */
14261 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014262 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014263}
14264
14265void
14266PyUnicode_InternImmortal(PyObject **p)
14267{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014268 PyUnicode_InternInPlace(p);
14269 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014270 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 Py_INCREF(*p);
14272 }
Walter Dörwald16807132007-05-25 13:52:07 +000014273}
14274
14275PyObject *
14276PyUnicode_InternFromString(const char *cp)
14277{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014278 PyObject *s = PyUnicode_FromString(cp);
14279 if (s == NULL)
14280 return NULL;
14281 PyUnicode_InternInPlace(&s);
14282 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014283}
14284
Alexander Belopolsky40018472011-02-26 01:02:56 +000014285void
14286_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014287{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014288 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014289 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 Py_ssize_t i, n;
14291 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014292
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 if (interned == NULL || !PyDict_Check(interned))
14294 return;
14295 keys = PyDict_Keys(interned);
14296 if (keys == NULL || !PyList_Check(keys)) {
14297 PyErr_Clear();
14298 return;
14299 }
Walter Dörwald16807132007-05-25 13:52:07 +000014300
Benjamin Peterson14339b62009-01-31 16:36:08 +000014301 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14302 detector, interned unicode strings are not forcibly deallocated;
14303 rather, we give them their stolen references back, and then clear
14304 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014305
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 n = PyList_GET_SIZE(keys);
14307 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014308 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014309 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014310 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014311 if (PyUnicode_READY(s) == -1) {
14312 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014313 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014315 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 case SSTATE_NOT_INTERNED:
14317 /* XXX Shouldn't happen */
14318 break;
14319 case SSTATE_INTERNED_IMMORTAL:
14320 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014321 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014322 break;
14323 case SSTATE_INTERNED_MORTAL:
14324 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014325 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014326 break;
14327 default:
14328 Py_FatalError("Inconsistent interned string state.");
14329 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014330 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014331 }
14332 fprintf(stderr, "total size of all interned strings: "
14333 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14334 "mortal/immortal\n", mortal_size, immortal_size);
14335 Py_DECREF(keys);
14336 PyDict_Clear(interned);
14337 Py_DECREF(interned);
14338 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014339}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014340
14341
14342/********************* Unicode Iterator **************************/
14343
14344typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014345 PyObject_HEAD
14346 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014347 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014348} unicodeiterobject;
14349
14350static void
14351unicodeiter_dealloc(unicodeiterobject *it)
14352{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014353 _PyObject_GC_UNTRACK(it);
14354 Py_XDECREF(it->it_seq);
14355 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014356}
14357
14358static int
14359unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14360{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 Py_VISIT(it->it_seq);
14362 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014363}
14364
14365static PyObject *
14366unicodeiter_next(unicodeiterobject *it)
14367{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014368 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014369
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 assert(it != NULL);
14371 seq = it->it_seq;
14372 if (seq == NULL)
14373 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014374 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014376 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14377 int kind = PyUnicode_KIND(seq);
14378 void *data = PyUnicode_DATA(seq);
14379 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14380 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014381 if (item != NULL)
14382 ++it->it_index;
14383 return item;
14384 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014385
Benjamin Peterson14339b62009-01-31 16:36:08 +000014386 Py_DECREF(seq);
14387 it->it_seq = NULL;
14388 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014389}
14390
14391static PyObject *
14392unicodeiter_len(unicodeiterobject *it)
14393{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014394 Py_ssize_t len = 0;
14395 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014396 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014397 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014398}
14399
14400PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14401
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014402static PyObject *
14403unicodeiter_reduce(unicodeiterobject *it)
14404{
14405 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014406 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014407 it->it_seq, it->it_index);
14408 } else {
14409 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14410 if (u == NULL)
14411 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014412 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014413 }
14414}
14415
14416PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14417
14418static PyObject *
14419unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14420{
14421 Py_ssize_t index = PyLong_AsSsize_t(state);
14422 if (index == -1 && PyErr_Occurred())
14423 return NULL;
14424 if (index < 0)
14425 index = 0;
14426 it->it_index = index;
14427 Py_RETURN_NONE;
14428}
14429
14430PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14431
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014432static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014434 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014435 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14436 reduce_doc},
14437 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14438 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014439 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014440};
14441
14442PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014443 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14444 "str_iterator", /* tp_name */
14445 sizeof(unicodeiterobject), /* tp_basicsize */
14446 0, /* tp_itemsize */
14447 /* methods */
14448 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14449 0, /* tp_print */
14450 0, /* tp_getattr */
14451 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014452 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014453 0, /* tp_repr */
14454 0, /* tp_as_number */
14455 0, /* tp_as_sequence */
14456 0, /* tp_as_mapping */
14457 0, /* tp_hash */
14458 0, /* tp_call */
14459 0, /* tp_str */
14460 PyObject_GenericGetAttr, /* tp_getattro */
14461 0, /* tp_setattro */
14462 0, /* tp_as_buffer */
14463 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14464 0, /* tp_doc */
14465 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14466 0, /* tp_clear */
14467 0, /* tp_richcompare */
14468 0, /* tp_weaklistoffset */
14469 PyObject_SelfIter, /* tp_iter */
14470 (iternextfunc)unicodeiter_next, /* tp_iternext */
14471 unicodeiter_methods, /* tp_methods */
14472 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014473};
14474
14475static PyObject *
14476unicode_iter(PyObject *seq)
14477{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014478 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014479
Benjamin Peterson14339b62009-01-31 16:36:08 +000014480 if (!PyUnicode_Check(seq)) {
14481 PyErr_BadInternalCall();
14482 return NULL;
14483 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014484 if (PyUnicode_READY(seq) == -1)
14485 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014486 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14487 if (it == NULL)
14488 return NULL;
14489 it->it_index = 0;
14490 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014491 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014492 _PyObject_GC_TRACK(it);
14493 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014494}
14495
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014496
14497size_t
14498Py_UNICODE_strlen(const Py_UNICODE *u)
14499{
14500 int res = 0;
14501 while(*u++)
14502 res++;
14503 return res;
14504}
14505
14506Py_UNICODE*
14507Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14508{
14509 Py_UNICODE *u = s1;
14510 while ((*u++ = *s2++));
14511 return s1;
14512}
14513
14514Py_UNICODE*
14515Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14516{
14517 Py_UNICODE *u = s1;
14518 while ((*u++ = *s2++))
14519 if (n-- == 0)
14520 break;
14521 return s1;
14522}
14523
14524Py_UNICODE*
14525Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14526{
14527 Py_UNICODE *u1 = s1;
14528 u1 += Py_UNICODE_strlen(u1);
14529 Py_UNICODE_strcpy(u1, s2);
14530 return s1;
14531}
14532
14533int
14534Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14535{
14536 while (*s1 && *s2 && *s1 == *s2)
14537 s1++, s2++;
14538 if (*s1 && *s2)
14539 return (*s1 < *s2) ? -1 : +1;
14540 if (*s1)
14541 return 1;
14542 if (*s2)
14543 return -1;
14544 return 0;
14545}
14546
14547int
14548Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14549{
14550 register Py_UNICODE u1, u2;
14551 for (; n != 0; n--) {
14552 u1 = *s1;
14553 u2 = *s2;
14554 if (u1 != u2)
14555 return (u1 < u2) ? -1 : +1;
14556 if (u1 == '\0')
14557 return 0;
14558 s1++;
14559 s2++;
14560 }
14561 return 0;
14562}
14563
14564Py_UNICODE*
14565Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14566{
14567 const Py_UNICODE *p;
14568 for (p = s; *p; p++)
14569 if (*p == c)
14570 return (Py_UNICODE*)p;
14571 return NULL;
14572}
14573
14574Py_UNICODE*
14575Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14576{
14577 const Py_UNICODE *p;
14578 p = s + Py_UNICODE_strlen(s);
14579 while (p != s) {
14580 p--;
14581 if (*p == c)
14582 return (Py_UNICODE*)p;
14583 }
14584 return NULL;
14585}
Victor Stinner331ea922010-08-10 16:37:20 +000014586
Victor Stinner71133ff2010-09-01 23:43:53 +000014587Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014588PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014589{
Victor Stinner577db2c2011-10-11 22:12:48 +020014590 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014591 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014593 if (!PyUnicode_Check(unicode)) {
14594 PyErr_BadArgument();
14595 return NULL;
14596 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014597 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014598 if (u == NULL)
14599 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014600 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014601 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014602 PyErr_NoMemory();
14603 return NULL;
14604 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014605 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014606 size *= sizeof(Py_UNICODE);
14607 copy = PyMem_Malloc(size);
14608 if (copy == NULL) {
14609 PyErr_NoMemory();
14610 return NULL;
14611 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014612 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014613 return copy;
14614}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014615
Georg Brandl66c221e2010-10-14 07:04:07 +000014616/* A _string module, to export formatter_parser and formatter_field_name_split
14617 to the string.Formatter class implemented in Python. */
14618
14619static PyMethodDef _string_methods[] = {
14620 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14621 METH_O, PyDoc_STR("split the argument as a field name")},
14622 {"formatter_parser", (PyCFunction) formatter_parser,
14623 METH_O, PyDoc_STR("parse the argument as a format string")},
14624 {NULL, NULL}
14625};
14626
14627static struct PyModuleDef _string_module = {
14628 PyModuleDef_HEAD_INIT,
14629 "_string",
14630 PyDoc_STR("string helper module"),
14631 0,
14632 _string_methods,
14633 NULL,
14634 NULL,
14635 NULL,
14636 NULL
14637};
14638
14639PyMODINIT_FUNC
14640PyInit__string(void)
14641{
14642 return PyModule_Create(&_string_module);
14643}
14644
14645
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014646#ifdef __cplusplus
14647}
14648#endif