blob: abe793dfd4870859b8dc313af323d8ef3c4d5ef5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000058/* --- Globals ------------------------------------------------------------
59
Serhiy Storchaka05997252013-01-26 12:14:02 +020060NOTE: In the interpreter's initialization phase, some globals are currently
61 initialized dynamically as needed. In the process Unicode objects may
62 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000063
64*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
Victor Stinner8faf8212011-12-08 22:14:11 +010071/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
72#define MAX_UNICODE 0x10ffff
73
Victor Stinner910337b2011-10-03 03:20:16 +020074#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020075# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020076#else
77# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
78#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
Victor Stinnere90fe6a2011-10-01 16:48:13 +020080#define _PyUnicode_UTF8(op) \
81 (((PyCompactUnicodeObject*)(op))->utf8)
82#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020083 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020084 assert(PyUnicode_IS_READY(op)), \
85 PyUnicode_IS_COMPACT_ASCII(op) ? \
86 ((char*)((PyASCIIObject*)(op) + 1)) : \
87 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +020088#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020089 (((PyCompactUnicodeObject*)(op))->utf8_length)
90#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +020091 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +020092 assert(PyUnicode_IS_READY(op)), \
93 PyUnicode_IS_COMPACT_ASCII(op) ? \
94 ((PyASCIIObject*)(op))->length : \
95 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +020096#define _PyUnicode_WSTR(op) \
97 (((PyASCIIObject*)(op))->wstr)
98#define _PyUnicode_WSTR_LENGTH(op) \
99 (((PyCompactUnicodeObject*)(op))->wstr_length)
100#define _PyUnicode_LENGTH(op) \
101 (((PyASCIIObject *)(op))->length)
102#define _PyUnicode_STATE(op) \
103 (((PyASCIIObject *)(op))->state)
104#define _PyUnicode_HASH(op) \
105 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200106#define _PyUnicode_KIND(op) \
107 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200108 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200109#define _PyUnicode_GET_LENGTH(op) \
110 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200112#define _PyUnicode_DATA_ANY(op) \
113 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New() */
117#define MAX_MAXCHAR(maxchar1, maxchar2) \
118 ((maxchar1) | (maxchar2))
119
Victor Stinner910337b2011-10-03 03:20:16 +0200120#undef PyUnicode_READY
121#define PyUnicode_READY(op) \
122 (assert(_PyUnicode_CHECK(op)), \
123 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200124 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100125 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200126
Victor Stinnerc379ead2011-10-03 12:52:27 +0200127#define _PyUnicode_SHARE_UTF8(op) \
128 (assert(_PyUnicode_CHECK(op)), \
129 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
130 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the very same pointer
   as its character data, i.e. wstr is shared with the data buffer.
   The comparison uses the macro argument itself on both sides, so the
   macro works whatever the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
134
Victor Stinner829c0ad2011-10-03 01:08:02 +0200135/* true if the Unicode object has an allocated UTF-8 memory block
136 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200137#define _PyUnicode_HAS_UTF8_MEMORY(op) \
138 (assert(_PyUnicode_CHECK(op)), \
139 (!PyUnicode_IS_COMPACT_ASCII(op) \
140 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200141 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
142
Victor Stinner03490912011-10-03 23:45:12 +0200143/* true if the Unicode object has an allocated wstr memory block
144 (not shared with other data) */
145#define _PyUnicode_HAS_WSTR_MEMORY(op) \
146 (assert(_PyUnicode_CHECK(op)), \
147 (_PyUnicode_WSTR(op) && \
148 (!PyUnicode_IS_READY(op) || \
149 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
150
Victor Stinner910337b2011-10-03 03:20:16 +0200151/* Generic helper macro to convert characters of different types.
152 from_type and to_type have to be valid type names, begin and end
153 are pointers to the source characters which should be of type
154 "from_type *". to is a pointer of type "to_type *" and points to the
155 buffer where the result characters are written to. */
/* Copies the characters in [begin, end) to the buffer at `to`, casting each
   one from from_type to to_type.  The bulk of the input is processed four
   characters per iteration; the remaining 0-3 characters are copied one by
   one.  Every argument is evaluated exactly once, and `to` is parenthesized
   before the cast so that an expression such as `base + offset` is converted
   as a whole instead of having the offset rescaled by sizeof(to_type). */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200174
Walter Dörwald16807132007-05-25 13:52:07 +0000175/* This dictionary holds all interned unicode strings. Note that references
176 to strings in this dictionary are *not* counted in the string's ob_refcnt.
177 When the interned string reaches a refcnt of 0 the string deallocation
178 function will delete the reference from this dictionary.
179
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000182*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200183static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) the first time it is needed.  If that creation
   fails, unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else                                                            \
            Py_INCREF(unicode_empty);                                   \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_RETURN_UNICODE_EMPTY() \
202 do { \
203 _Py_INCREF_UNICODE_EMPTY(); \
204 return unicode_empty; \
205 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200207/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* Single character Unicode strings in the Latin-1 range are being
211 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
Christian Heimes190d79e2008-01-30 11:58:22 +0000214/* Fast detection of the most frequent whitespace characters */
/* Lookup table indexed by code point 0x00-0x7F: 1 for whitespace, else 0.
   One row per 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
244
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200245/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200247static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100248static int unicode_modifiable(PyObject *unicode);
249
Victor Stinnerfe226c02011-10-03 03:52:20 +0200250
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200252_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
253static PyObject *
254_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
255static PyObject *
256_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
257
258static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000260 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100261 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000262 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264static void
265raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300266 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100267 PyObject *unicode,
268 Py_ssize_t startpos, Py_ssize_t endpos,
269 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000270
Christian Heimes190d79e2008-01-30 11:58:22 +0000271/* Same for linebreaks */
/* Lookup table indexed by code point 0x00-0x7F: 1 for line break
   characters, else 0.  One row per 16 code points.  The table is read-only
   data, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
298
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300299/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
300 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000302PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000303{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000304#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000305 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000307 /* This is actually an illegal character, so it should
308 not be passed to unichr. */
309 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000310#endif
311}
312
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a Unicode object.

   Every invariant is verified with assert(); the function itself always
   returns 1, which lets callers write assert(_PyUnicode_CheckConsistency(...)).
   If check_content is true and the string is not in the legacy wchar_t-only
   state, the characters are scanned as well to verify that the narrowest
   possible kind is in use and that the buffer is null-terminated. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (base->state.ascii == 1 && base->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *payload;

        if (base->state.compact == 1) {
            /* compact non-ASCII string: characters follow the struct */
            payload = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert (cu->utf8 != payload);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that only has a wchar_t representation */
            payload = ((PyUnicodeObject *)op)->data.any;
            assert(base->length == 0);
            assert(base->hash == -1);
            assert(base->state.compact == 0);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 0);
            assert(base->state.interned == SSTATE_NOT_INTERNED);
            assert(base->wstr != NULL);
            assert(payload == NULL);
            assert(cu->utf8 == NULL);
        }
        else {
            /* legacy string that has been made ready */
            payload = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.compact == 0);
            assert(base->state.ready == 1);
            assert(payload != NULL);
            if (base->state.ascii) {
                assert (cu->utf8 == payload);
                assert (cu->utf8_length == base->length);
            }
            else
                assert (cu->utf8 != payload);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr is shared with the data exactly when the kind has the
               same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
            const int wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
            const int wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
            if (wchar_sized) {
                assert(base->wstr == payload);
                assert(cu->wstr_length == base->length);
            }
            else
                assert(base->wstr != payload);
        }

        /* an absent cache must have a zero cached length */
        assert(cu->utf8 != NULL || cu->utf8_length == 0);
        assert(base->wstr != NULL || cu->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos = 0;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        while (pos < base->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (ch > highest)
                highest = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else
                assert(highest < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the character buffer must be null-terminated */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429static PyObject*
430unicode_result_wchar(PyObject *unicode)
431{
432#ifndef Py_DEBUG
433 Py_ssize_t len;
434
435 assert(Py_REFCNT(unicode) == 1);
436
437 len = _PyUnicode_WSTR_LENGTH(unicode);
438 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100439 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200440 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 }
442
443 if (len == 1) {
444 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
445 if (ch < 256) {
446 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
447 Py_DECREF(unicode);
448 return latin1_char;
449 }
450 }
451
452 if (_PyUnicode_Ready(unicode) < 0) {
453 Py_XDECREF(unicode);
454 return NULL;
455 }
456#else
457 /* don't make the result ready in debug mode to ensure that the caller
458 makes the string ready before using it */
459 assert(_PyUnicode_CheckConsistency(unicode, 1));
460#endif
461 return unicode;
462}
463
464static PyObject*
465unicode_result_ready(PyObject *unicode)
466{
467 Py_ssize_t length;
468
469 length = PyUnicode_GET_LENGTH(unicode);
470 if (length == 0) {
471 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200473 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 }
475 return unicode_empty;
476 }
477
478 if (length == 1) {
479 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
480 if (ch < 256) {
481 PyObject *latin1_char = unicode_latin1[ch];
482 if (latin1_char != NULL) {
483 if (unicode != latin1_char) {
484 Py_INCREF(latin1_char);
485 Py_DECREF(unicode);
486 }
487 return latin1_char;
488 }
489 else {
490 assert(_PyUnicode_CheckConsistency(unicode, 1));
491 Py_INCREF(unicode);
492 unicode_latin1[ch] = unicode;
493 return unicode;
494 }
495 }
496 }
497
498 assert(_PyUnicode_CheckConsistency(unicode, 1));
499 return unicode;
500}
501
502static PyObject*
503unicode_result(PyObject *unicode)
504{
505 assert(_PyUnicode_CHECK(unicode));
506 if (PyUnicode_IS_READY(unicode))
507 return unicode_result_ready(unicode);
508 else
509 return unicode_result_wchar(unicode);
510}
511
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512static PyObject*
513unicode_result_unchanged(PyObject *unicode)
514{
515 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500516 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100517 return NULL;
518 Py_INCREF(unicode);
519 return unicode;
520 }
521 else
522 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100523 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100524}
525
Victor Stinner3a50e702011-10-18 21:21:00 +0200526#ifdef HAVE_MBCS
527static OSVERSIONINFOEX winver;
528#endif
529
Thomas Wouters477c8d52006-05-27 19:21:47 +0000530/* --- Bloom Filters ----------------------------------------------------- */
531
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
535
536/* the linebreak mask is set up by Unicode_Init below */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538#if LONG_BIT >= 128
539#define BLOOM_WIDTH 128
540#elif LONG_BIT >= 64
541#define BLOOM_WIDTH 64
542#elif LONG_BIT >= 32
543#define BLOOM_WIDTH 32
544#else
545#error "LONG_BIT is smaller than 32"
546#endif
547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548#define BLOOM_MASK unsigned long
549
Serhiy Storchaka05997252013-01-26 12:14:02 +0200550static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
/* BLOOM_ADD sets, and BLOOM tests, the bit of `mask` selected by the low
   bits of `ch` (ch modulo BLOOM_WIDTH).  `mask` is parenthesized so that a
   compound expression such as `a | b` is treated as a single operand;
   for BLOOM_ADD it must be an lvalue. */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= ((BLOOM_MASK)1 << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & ((BLOOM_MASK)1 << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Benjamin Peterson29060642009-01-31 22:14:21 +0000555#define BLOOM_LINEBREAK(ch) \
556 ((ch) < 128U ? ascii_linebreak[(ch)] : \
557 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Alexander Belopolsky40018472011-02-26 01:02:56 +0000559Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000561{
562 /* calculate simple bloom-style bitmask for a given unicode string */
563
Antoine Pitrouf068f942010-01-13 14:19:12 +0000564 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565 Py_ssize_t i;
566
567 mask = 0;
568 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 return mask;
572}
573
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200574#define BLOOM_MEMBER(mask, chr, str) \
575 (BLOOM(mask, chr) \
576 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000577
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200578/* Compilation of templated routines */
579
580#include "stringlib/asciilib.h"
581#include "stringlib/fastsearch.h"
582#include "stringlib/partition.h"
583#include "stringlib/split.h"
584#include "stringlib/count.h"
585#include "stringlib/find.h"
586#include "stringlib/find_max_char.h"
587#include "stringlib/localeutil.h"
588#include "stringlib/undef.h"
589
590#include "stringlib/ucs1lib.h"
591#include "stringlib/fastsearch.h"
592#include "stringlib/partition.h"
593#include "stringlib/split.h"
594#include "stringlib/count.h"
595#include "stringlib/find.h"
596#include "stringlib/find_max_char.h"
597#include "stringlib/localeutil.h"
598#include "stringlib/undef.h"
599
600#include "stringlib/ucs2lib.h"
601#include "stringlib/fastsearch.h"
602#include "stringlib/partition.h"
603#include "stringlib/split.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs4lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
616#include "stringlib/find_max_char.h"
617#include "stringlib/localeutil.h"
618#include "stringlib/undef.h"
619
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620#include "stringlib/unicodedefs.h"
621#include "stringlib/fastsearch.h"
622#include "stringlib/count.h"
623#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100624#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200625
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626/* --- Unicode Object ----------------------------------------------------- */
627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200629fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200631Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
632 Py_ssize_t size, Py_UCS4 ch,
633 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
636
637 switch (kind) {
638 case PyUnicode_1BYTE_KIND:
639 {
640 Py_UCS1 ch1 = (Py_UCS1) ch;
641 if (ch1 == ch)
642 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
643 else
644 return -1;
645 }
646 case PyUnicode_2BYTE_KIND:
647 {
648 Py_UCS2 ch2 = (Py_UCS2) ch;
649 if (ch2 == ch)
650 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
651 else
652 return -1;
653 }
654 case PyUnicode_4BYTE_KIND:
655 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
656 default:
657 assert(0);
658 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660}
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200670 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100672 assert(PyUnicode_IS_COMPACT(unicode));
673
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200674 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100675 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 struct_size = sizeof(PyASCIIObject);
677 else
678 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
682 PyErr_NoMemory();
683 return NULL;
684 }
685 new_size = (struct_size + (length + 1) * char_size);
686
Victor Stinner84def372011-12-11 20:04:56 +0100687 _Py_DEC_REFTOTAL;
688 _Py_ForgetReference(unicode);
689
690 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
691 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100692 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 PyErr_NoMemory();
694 return NULL;
695 }
Victor Stinner84def372011-12-11 20:04:56 +0100696 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200700 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100702 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200703 _PyUnicode_WSTR_LENGTH(unicode) = length;
704 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
706 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200707 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708 return unicode;
709}
710
Alexander Belopolsky40018472011-02-26 01:02:56 +0000711static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200712resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000713{
Victor Stinner95663112011-10-04 01:03:50 +0200714 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100715 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (PyUnicode_IS_READY(unicode)) {
720 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200721 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 void *data;
723
724 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200725 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
727 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728
729 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
730 PyErr_NoMemory();
731 return -1;
732 }
733 new_size = (length + 1) * char_size;
734
Victor Stinner7a9105a2011-12-12 00:13:42 +0100735 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
736 {
737 PyObject_DEL(_PyUnicode_UTF8(unicode));
738 _PyUnicode_UTF8(unicode) = NULL;
739 _PyUnicode_UTF8_LENGTH(unicode) = 0;
740 }
741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 data = (PyObject *)PyObject_REALLOC(data, new_size);
743 if (data == NULL) {
744 PyErr_NoMemory();
745 return -1;
746 }
747 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200748 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 _PyUnicode_WSTR_LENGTH(unicode) = length;
751 }
752 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200753 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200754 _PyUnicode_UTF8_LENGTH(unicode) = length;
755 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200756 _PyUnicode_LENGTH(unicode) = length;
757 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200758 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200762 }
Victor Stinner95663112011-10-04 01:03:50 +0200763 assert(_PyUnicode_WSTR(unicode) != NULL);
764
765 /* check for integer overflow */
766 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
767 PyErr_NoMemory();
768 return -1;
769 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100770 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200771 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100772 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200773 if (!wstr) {
774 PyErr_NoMemory();
775 return -1;
776 }
777 _PyUnicode_WSTR(unicode) = wstr;
778 _PyUnicode_WSTR(unicode)[length] = 0;
779 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200780 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781 return 0;
782}
783
Victor Stinnerfe226c02011-10-03 03:52:20 +0200784static PyObject*
785resize_copy(PyObject *unicode, Py_ssize_t length)
786{
787 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100788 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790
Benjamin Petersonbac79492012-01-14 13:34:47 -0500791 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100792 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793
794 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
795 if (copy == NULL)
796 return NULL;
797
798 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200799 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200800 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200801 }
802 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200803 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100804
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200805 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 if (w == NULL)
807 return NULL;
808 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
809 copy_length = Py_MIN(copy_length, length);
810 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
811 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200812 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
814}
815
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000817 Ux0000 terminated; some code (e.g. new_identifier)
818 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819
820 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000821 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822
823*/
824
Alexander Belopolsky40018472011-02-26 01:02:56 +0000825static PyUnicodeObject *
826_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827{
828 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830
Thomas Wouters477c8d52006-05-27 19:21:47 +0000831 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832 if (length == 0 && unicode_empty != NULL) {
833 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200834 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835 }
836
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000837 /* Ensure we won't overflow the size. */
838 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
839 return (PyUnicodeObject *)PyErr_NoMemory();
840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841 if (length < 0) {
842 PyErr_SetString(PyExc_SystemError,
843 "Negative size passed to _PyUnicode_New");
844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 }
846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
848 if (unicode == NULL)
849 return NULL;
850 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
851 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
852 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100853 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000854 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100855 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857
Jeremy Hyltond8082792003-09-16 19:41:39 +0000858 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000859 * the caller fails before initializing str -- unicode_resize()
860 * reads str[0], and the Keep-Alive optimization can keep memory
861 * allocated for str alive across a call to unicode_dealloc(unicode).
862 * We don't want unicode_resize to read uninitialized memory in
863 * that case.
864 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 _PyUnicode_WSTR(unicode)[0] = 0;
866 _PyUnicode_WSTR(unicode)[length] = 0;
867 _PyUnicode_WSTR_LENGTH(unicode) = length;
868 _PyUnicode_HASH(unicode) = -1;
869 _PyUnicode_STATE(unicode).interned = 0;
870 _PyUnicode_STATE(unicode).kind = 0;
871 _PyUnicode_STATE(unicode).compact = 0;
872 _PyUnicode_STATE(unicode).ready = 0;
873 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200874 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200876 _PyUnicode_UTF8(unicode) = NULL;
877 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100878 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879 return unicode;
880}
881
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882static const char*
883unicode_kind_name(PyObject *unicode)
884{
Victor Stinner42dfd712011-10-03 14:41:45 +0200885 /* don't check consistency: unicode_kind_name() is called from
886 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 if (!PyUnicode_IS_COMPACT(unicode))
888 {
889 if (!PyUnicode_IS_READY(unicode))
890 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 {
893 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200894 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200895 return "legacy ascii";
896 else
897 return "legacy latin1";
898 case PyUnicode_2BYTE_KIND:
899 return "legacy UCS2";
900 case PyUnicode_4BYTE_KIND:
901 return "legacy UCS4";
902 default:
903 return "<legacy invalid kind>";
904 }
905 }
906 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600907 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 return "ascii";
911 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200912 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200916 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200917 default:
918 return "<invalid compact kind>";
919 }
920}
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
927
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
931void *_PyUnicode_data(void *unicode){
932 printf("obj %p\n", unicode);
933 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
934 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
935 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
936 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
937 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
938 return PyUnicode_DATA(unicode);
939}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200940
941void
942_PyUnicode_Dump(PyObject *op)
943{
944 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200945 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
946 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
947 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200948
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200950 {
951 if (ascii->state.ascii)
952 data = (ascii + 1);
953 else
954 data = (compact + 1);
955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 else
957 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200958 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
959
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 if (ascii->wstr == data)
961 printf("shared ");
962 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200963
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 printf(" (%zu), ", compact->wstr_length);
966 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
967 printf("shared ");
968 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200969 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200971}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972#endif
973
974PyObject *
975PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
976{
977 PyObject *obj;
978 PyCompactUnicodeObject *unicode;
979 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200980 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 Py_ssize_t char_size;
983 Py_ssize_t struct_size;
984
985 /* Optimization for empty strings */
986 if (size == 0 && unicode_empty != NULL) {
987 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200988 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 }
990
Victor Stinner9e9d6892011-10-04 01:02:02 +0200991 is_ascii = 0;
992 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 struct_size = sizeof(PyCompactUnicodeObject);
994 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200995 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996 char_size = 1;
997 is_ascii = 1;
998 struct_size = sizeof(PyASCIIObject);
999 }
1000 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001001 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 char_size = 1;
1003 }
1004 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001005 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 char_size = 2;
1007 if (sizeof(wchar_t) == 2)
1008 is_sharing = 1;
1009 }
1010 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001011 if (maxchar > MAX_UNICODE) {
1012 PyErr_SetString(PyExc_SystemError,
1013 "invalid maximum character passed to PyUnicode_New");
1014 return NULL;
1015 }
Victor Stinner8f825062012-04-27 13:55:39 +02001016 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017 char_size = 4;
1018 if (sizeof(wchar_t) == 4)
1019 is_sharing = 1;
1020 }
1021
1022 /* Ensure we won't overflow the size. */
1023 if (size < 0) {
1024 PyErr_SetString(PyExc_SystemError,
1025 "Negative size passed to PyUnicode_New");
1026 return NULL;
1027 }
1028 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1029 return PyErr_NoMemory();
1030
1031 /* Duplicated allocation code from _PyObject_New() instead of a call to
1032 * PyObject_New() so we are able to allocate space for the object and
1033 * it's data buffer.
1034 */
1035 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1036 if (obj == NULL)
1037 return PyErr_NoMemory();
1038 obj = PyObject_INIT(obj, &PyUnicode_Type);
1039 if (obj == NULL)
1040 return NULL;
1041
1042 unicode = (PyCompactUnicodeObject *)obj;
1043 if (is_ascii)
1044 data = ((PyASCIIObject*)obj) + 1;
1045 else
1046 data = unicode + 1;
1047 _PyUnicode_LENGTH(unicode) = size;
1048 _PyUnicode_HASH(unicode) = -1;
1049 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001050 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 _PyUnicode_STATE(unicode).compact = 1;
1052 _PyUnicode_STATE(unicode).ready = 1;
1053 _PyUnicode_STATE(unicode).ascii = is_ascii;
1054 if (is_ascii) {
1055 ((char*)data)[size] = 0;
1056 _PyUnicode_WSTR(unicode) = NULL;
1057 }
Victor Stinner8f825062012-04-27 13:55:39 +02001058 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 ((char*)data)[size] = 0;
1060 _PyUnicode_WSTR(unicode) = NULL;
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001063 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 else {
1066 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001067 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001068 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001070 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 ((Py_UCS4*)data)[size] = 0;
1072 if (is_sharing) {
1073 _PyUnicode_WSTR_LENGTH(unicode) = size;
1074 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1075 }
1076 else {
1077 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1078 _PyUnicode_WSTR(unicode) = NULL;
1079 }
1080 }
Victor Stinner8f825062012-04-27 13:55:39 +02001081#ifdef Py_DEBUG
1082 /* Fill the data with invalid characters to detect bugs earlier.
1083 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1084 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1085 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1086 memset(data, 0xff, size * kind);
1087#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001088 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 return obj;
1090}
1091
1092#if SIZEOF_WCHAR_T == 2
1093/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1094 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001095 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096
1097 This function assumes that unicode can hold one more code point than wstr
1098 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001099static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001101 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001102{
1103 const wchar_t *iter;
1104 Py_UCS4 *ucs4_out;
1105
Victor Stinner910337b2011-10-03 03:20:16 +02001106 assert(unicode != NULL);
1107 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001108 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1109 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1110
1111 for (iter = begin; iter < end; ) {
1112 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1113 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001114 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1115 && (iter+1) < end
1116 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 {
Victor Stinner551ac952011-11-29 22:58:13 +01001118 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001119 iter += 2;
1120 }
1121 else {
1122 *ucs4_out++ = *iter;
1123 iter++;
1124 }
1125 }
1126 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1127 _PyUnicode_GET_LENGTH(unicode)));
1128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129}
1130#endif
1131
Victor Stinnercd9950f2011-10-02 00:34:53 +02001132static int
Victor Stinner488fa492011-12-12 00:01:39 +01001133unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001134{
Victor Stinner488fa492011-12-12 00:01:39 +01001135 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001136 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001137 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001138 return -1;
1139 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001140 return 0;
1141}
1142
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001143static int
1144_copy_characters(PyObject *to, Py_ssize_t to_start,
1145 PyObject *from, Py_ssize_t from_start,
1146 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 unsigned int from_kind, to_kind;
1149 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinneree4544c2012-05-09 22:24:08 +02001151 assert(0 <= how_many);
1152 assert(0 <= from_start);
1153 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001156 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157
Victor Stinnerd3f08822012-05-29 12:57:52 +02001158 assert(PyUnicode_Check(to));
1159 assert(PyUnicode_IS_READY(to));
1160 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1161
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001162 if (how_many == 0)
1163 return 0;
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001168 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169
Victor Stinnerf1852262012-06-16 16:38:26 +02001170#ifdef Py_DEBUG
1171 if (!check_maxchar
1172 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1173 {
1174 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1175 Py_UCS4 ch;
1176 Py_ssize_t i;
1177 for (i=0; i < how_many; i++) {
1178 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1179 assert(ch <= to_maxchar);
1180 }
1181 }
1182#endif
1183
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001184 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001185 if (check_maxchar
1186 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1187 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001188 /* Writing Latin-1 characters into an ASCII string requires to
1189 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001190 Py_UCS4 max_char;
1191 max_char = ucs1lib_find_max_char(from_data,
1192 (Py_UCS1*)from_data + how_many);
1193 if (max_char >= 128)
1194 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001195 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001196 Py_MEMCPY((char*)to_data + to_kind * to_start,
1197 (char*)from_data + from_kind * from_start,
1198 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 else if (from_kind == PyUnicode_1BYTE_KIND
1201 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001202 {
1203 _PyUnicode_CONVERT_BYTES(
1204 Py_UCS1, Py_UCS2,
1205 PyUnicode_1BYTE_DATA(from) + from_start,
1206 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1207 PyUnicode_2BYTE_DATA(to) + to_start
1208 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001210 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001211 && to_kind == PyUnicode_4BYTE_KIND)
1212 {
1213 _PyUnicode_CONVERT_BYTES(
1214 Py_UCS1, Py_UCS4,
1215 PyUnicode_1BYTE_DATA(from) + from_start,
1216 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1217 PyUnicode_4BYTE_DATA(to) + to_start
1218 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001219 }
1220 else if (from_kind == PyUnicode_2BYTE_KIND
1221 && to_kind == PyUnicode_4BYTE_KIND)
1222 {
1223 _PyUnicode_CONVERT_BYTES(
1224 Py_UCS2, Py_UCS4,
1225 PyUnicode_2BYTE_DATA(from) + from_start,
1226 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1227 PyUnicode_4BYTE_DATA(to) + to_start
1228 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001229 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001230 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001231 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1232
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001233 if (!check_maxchar) {
1234 if (from_kind == PyUnicode_2BYTE_KIND
1235 && to_kind == PyUnicode_1BYTE_KIND)
1236 {
1237 _PyUnicode_CONVERT_BYTES(
1238 Py_UCS2, Py_UCS1,
1239 PyUnicode_2BYTE_DATA(from) + from_start,
1240 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1241 PyUnicode_1BYTE_DATA(to) + to_start
1242 );
1243 }
1244 else if (from_kind == PyUnicode_4BYTE_KIND
1245 && to_kind == PyUnicode_1BYTE_KIND)
1246 {
1247 _PyUnicode_CONVERT_BYTES(
1248 Py_UCS4, Py_UCS1,
1249 PyUnicode_4BYTE_DATA(from) + from_start,
1250 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1251 PyUnicode_1BYTE_DATA(to) + to_start
1252 );
1253 }
1254 else if (from_kind == PyUnicode_4BYTE_KIND
1255 && to_kind == PyUnicode_2BYTE_KIND)
1256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS4, Py_UCS2,
1259 PyUnicode_4BYTE_DATA(from) + from_start,
1260 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_2BYTE_DATA(to) + to_start
1262 );
1263 }
1264 else {
1265 assert(0);
1266 return -1;
1267 }
1268 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001269 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001270 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001271 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001272 Py_ssize_t i;
1273
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 for (i=0; i < how_many; i++) {
1275 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001276 if (ch > to_maxchar)
1277 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1279 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001280 }
1281 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001282 return 0;
1283}
1284
Victor Stinnerd3f08822012-05-29 12:57:52 +02001285void
1286_PyUnicode_FastCopyCharacters(
1287 PyObject *to, Py_ssize_t to_start,
1288 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289{
1290 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1291}
1292
1293Py_ssize_t
1294PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1295 PyObject *from, Py_ssize_t from_start,
1296 Py_ssize_t how_many)
1297{
1298 int err;
1299
1300 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1301 PyErr_BadInternalCall();
1302 return -1;
1303 }
1304
Benjamin Petersonbac79492012-01-14 13:34:47 -05001305 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001306 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001307 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001308 return -1;
1309
Victor Stinnerd3f08822012-05-29 12:57:52 +02001310 if (from_start < 0) {
1311 PyErr_SetString(PyExc_IndexError, "string index out of range");
1312 return -1;
1313 }
1314 if (to_start < 0) {
1315 PyErr_SetString(PyExc_IndexError, "string index out of range");
1316 return -1;
1317 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001318 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1319 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1320 PyErr_Format(PyExc_SystemError,
1321 "Cannot write %zi characters at %zi "
1322 "in a string of %zi characters",
1323 how_many, to_start, PyUnicode_GET_LENGTH(to));
1324 return -1;
1325 }
1326
1327 if (how_many == 0)
1328 return 0;
1329
Victor Stinner488fa492011-12-12 00:01:39 +01001330 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001331 return -1;
1332
1333 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1334 if (err) {
1335 PyErr_Format(PyExc_SystemError,
1336 "Cannot copy %s characters "
1337 "into a string of %s characters",
1338 unicode_kind_name(from),
1339 unicode_kind_name(to));
1340 return -1;
1341 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001342 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343}
1344
Victor Stinner17222162011-09-28 22:15:37 +02001345/* Find the maximum code point and count the number of surrogate pairs so a
1346 correct string length can be computed before converting a string to UCS4.
1347 This function counts single surrogates as a character and not as a pair.
1348
1349 Return 0 on success, or -1 on error. */
1350static int
1351find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1352 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353{
1354 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001355 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356
Victor Stinnerc53be962011-10-02 21:33:54 +02001357 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 *num_surrogates = 0;
1359 *maxchar = 0;
1360
1361 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001363 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1364 && (iter+1) < end
1365 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001367 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 iter += 2;
1370 }
1371 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001373 {
1374 ch = *iter;
1375 iter++;
1376 }
1377 if (ch > *maxchar) {
1378 *maxchar = ch;
1379 if (*maxchar > MAX_UNICODE) {
1380 PyErr_Format(PyExc_ValueError,
1381 "character U+%x is not in range [U+0000; U+10ffff]",
1382 ch);
1383 return -1;
1384 }
1385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 }
1387 return 0;
1388}
1389
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001390int
1391_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392{
1393 wchar_t *end;
1394 Py_UCS4 maxchar = 0;
1395 Py_ssize_t num_surrogates;
1396#if SIZEOF_WCHAR_T == 2
1397 Py_ssize_t length_wo_surrogates;
1398#endif
1399
Georg Brandl7597add2011-10-05 16:36:47 +02001400 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001401 strings were created using _PyObject_New() and where no canonical
1402 representation (the str field) has been set yet aka strings
1403 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001404 assert(_PyUnicode_CHECK(unicode));
1405 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001407 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001408 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001409 /* Actually, it should neither be interned nor be anything else: */
1410 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001413 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001414 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416
1417 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001418 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1419 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 PyErr_NoMemory();
1421 return -1;
1422 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001423 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 _PyUnicode_WSTR(unicode), end,
1425 PyUnicode_1BYTE_DATA(unicode));
1426 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1427 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1428 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1429 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001430 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001431 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001435 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8(unicode) = NULL;
1437 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 PyObject_FREE(_PyUnicode_WSTR(unicode));
1440 _PyUnicode_WSTR(unicode) = NULL;
1441 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1442 }
1443 /* In this case we might have to convert down from 4-byte native
1444 wchar_t to 2-byte unicode. */
1445 else if (maxchar < 65536) {
1446 assert(num_surrogates == 0 &&
1447 "FindMaxCharAndNumSurrogatePairs() messed up");
1448
Victor Stinner506f5922011-09-28 22:34:18 +02001449#if SIZEOF_WCHAR_T == 2
1450 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001452 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1453 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1454 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 _PyUnicode_UTF8(unicode) = NULL;
1456 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001457#else
1458 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001460 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001462 PyErr_NoMemory();
1463 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 }
Victor Stinner506f5922011-09-28 22:34:18 +02001465 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1466 _PyUnicode_WSTR(unicode), end,
1467 PyUnicode_2BYTE_DATA(unicode));
1468 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1469 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1470 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001471 _PyUnicode_UTF8(unicode) = NULL;
1472 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001473 PyObject_FREE(_PyUnicode_WSTR(unicode));
1474 _PyUnicode_WSTR(unicode) = NULL;
1475 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1476#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 }
1478 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1479 else {
1480#if SIZEOF_WCHAR_T == 2
1481 /* in case the native representation is 2-bytes, we need to allocate a
1482 new normalized 4-byte version. */
1483 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001484 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1485 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 PyErr_NoMemory();
1487 return -1;
1488 }
1489 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1490 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001491 _PyUnicode_UTF8(unicode) = NULL;
1492 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001493 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1494 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001495 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 PyObject_FREE(_PyUnicode_WSTR(unicode));
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1499#else
1500 assert(num_surrogates == 0);
1501
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001504 _PyUnicode_UTF8(unicode) = NULL;
1505 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001506 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1507#endif
1508 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1509 }
1510 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001511 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 return 0;
1513}
1514
Alexander Belopolsky40018472011-02-26 01:02:56 +00001515static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001516unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517{
Walter Dörwald16807132007-05-25 13:52:07 +00001518 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001519 case SSTATE_NOT_INTERNED:
1520 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 case SSTATE_INTERNED_MORTAL:
1523 /* revive dead object temporarily for DelItem */
1524 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001525 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 Py_FatalError(
1527 "deletion of interned string failed");
1528 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001529
Benjamin Peterson29060642009-01-31 22:14:21 +00001530 case SSTATE_INTERNED_IMMORTAL:
1531 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001532
Benjamin Peterson29060642009-01-31 22:14:21 +00001533 default:
1534 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001535 }
1536
Victor Stinner03490912011-10-03 23:45:12 +02001537 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001539 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001540 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001541 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1542 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001544 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545}
1546
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical (non-wchar_t) kind
       can be a cached Latin-1 singleton. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1563
Alexander Belopolsky40018472011-02-26 01:02:56 +00001564static int
Victor Stinner488fa492011-12-12 00:01:39 +01001565unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001566{
Victor Stinner488fa492011-12-12 00:01:39 +01001567 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568 if (Py_REFCNT(unicode) != 1)
1569 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001570 if (_PyUnicode_HASH(unicode) != -1)
1571 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 if (PyUnicode_CHECK_INTERNED(unicode))
1573 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001574 if (!PyUnicode_CheckExact(unicode))
1575 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001576#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 /* singleton refcount is greater than 1 */
1578 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001579#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 return 1;
1581}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001582
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583static int
1584unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1585{
1586 PyObject *unicode;
1587 Py_ssize_t old_length;
1588
1589 assert(p_unicode != NULL);
1590 unicode = *p_unicode;
1591
1592 assert(unicode != NULL);
1593 assert(PyUnicode_Check(unicode));
1594 assert(0 <= length);
1595
Victor Stinner910337b2011-10-03 03:20:16 +02001596 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001597 old_length = PyUnicode_WSTR_LENGTH(unicode);
1598 else
1599 old_length = PyUnicode_GET_LENGTH(unicode);
1600 if (old_length == length)
1601 return 0;
1602
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001603 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001604 _Py_INCREF_UNICODE_EMPTY();
1605 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001606 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001607 Py_DECREF(*p_unicode);
1608 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609 return 0;
1610 }
1611
Victor Stinner488fa492011-12-12 00:01:39 +01001612 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613 PyObject *copy = resize_copy(unicode, length);
1614 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 Py_DECREF(*p_unicode);
1617 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001618 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001619 }
1620
Victor Stinnerfe226c02011-10-03 03:52:20 +02001621 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001622 PyObject *new_unicode = resize_compact(unicode, length);
1623 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001625 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001626 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001627 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001628 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629}
1630
Alexander Belopolsky40018472011-02-26 01:02:56 +00001631int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001633{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634 PyObject *unicode;
1635 if (p_unicode == NULL) {
1636 PyErr_BadInternalCall();
1637 return -1;
1638 }
1639 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001640 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 {
1642 PyErr_BadInternalCall();
1643 return -1;
1644 }
1645 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001646}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001647
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001648static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001649unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1650 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001651{
1652 PyObject *result;
1653 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001654 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1656 return 0;
1657 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1658 maxchar);
1659 if (result == NULL)
1660 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001661 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001662 Py_DECREF(*p_unicode);
1663 *p_unicode = result;
1664 return 0;
1665}
1666
1667static int
1668unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1669 Py_UCS4 ch)
1670{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001671 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001672 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001673 return -1;
1674 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1675 PyUnicode_DATA(*p_unicode),
1676 (*pos)++, ch);
1677 return 0;
1678}
1679
Victor Stinnerc5166102012-02-22 13:55:02 +01001680/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001681
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001682 WARNING: The function doesn't copy the terminating null character and
1683 doesn't check the maximum character (may write a latin1 character in an
1684 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001685static void
1686unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1687 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001688{
1689 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1690 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001691 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001692
1693 switch (kind) {
1694 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001695 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001696 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001697 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001698 }
1699 case PyUnicode_2BYTE_KIND: {
1700 Py_UCS2 *start = (Py_UCS2 *)data + index;
1701 Py_UCS2 *ucs2 = start;
1702 assert(index <= PyUnicode_GET_LENGTH(unicode));
1703
Victor Stinner184252a2012-06-16 02:57:41 +02001704 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001705 *ucs2 = (Py_UCS2)*str;
1706
1707 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001708 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001709 }
1710 default: {
1711 Py_UCS4 *start = (Py_UCS4 *)data + index;
1712 Py_UCS4 *ucs4 = start;
1713 assert(kind == PyUnicode_4BYTE_KIND);
1714 assert(index <= PyUnicode_GET_LENGTH(unicode));
1715
Victor Stinner184252a2012-06-16 02:57:41 +02001716 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 *ucs4 = (Py_UCS4)*str;
1718
1719 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 }
1721 }
1722}
1723
1724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725static PyObject*
1726get_latin1_char(unsigned char ch)
1727{
Victor Stinnera464fc12011-10-02 20:39:30 +02001728 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001730 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 if (!unicode)
1732 return NULL;
1733 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001734 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 unicode_latin1[ch] = unicode;
1736 }
1737 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001738 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739}
1740
Alexander Belopolsky40018472011-02-26 01:02:56 +00001741PyObject *
1742PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001744 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 Py_UCS4 maxchar = 0;
1746 Py_ssize_t num_surrogates;
1747
1748 if (u == NULL)
1749 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001751 /* If the Unicode data is known at construction time, we can apply
1752 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001755 if (size == 0)
1756 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 /* Single character Unicode objects in the Latin-1 range are
1759 shared when using this constructor */
1760 if (size == 1 && *u < 256)
1761 return get_latin1_char((unsigned char)*u);
1762
1763 /* If not empty and not single character, copy the Unicode data
1764 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001765 if (find_maxchar_surrogates(u, u + size,
1766 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 return NULL;
1768
Victor Stinner8faf8212011-12-08 22:14:11 +01001769 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 if (!unicode)
1771 return NULL;
1772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 switch (PyUnicode_KIND(unicode)) {
1774 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001775 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1777 break;
1778 case PyUnicode_2BYTE_KIND:
1779#if Py_UNICODE_SIZE == 2
1780 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1781#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001782 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1784#endif
1785 break;
1786 case PyUnicode_4BYTE_KIND:
1787#if SIZEOF_WCHAR_T == 2
1788 /* This is the only case which has to process surrogates, thus
1789 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001790 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#else
1792 assert(num_surrogates == 0);
1793 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1794#endif
1795 break;
1796 default:
1797 assert(0 && "Impossible state");
1798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001800 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801}
1802
Alexander Belopolsky40018472011-02-26 01:02:56 +00001803PyObject *
1804PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001805{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001806 if (size < 0) {
1807 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 return NULL;
1810 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001811 if (u != NULL)
1812 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1813 else
1814 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001815}
1816
Alexander Belopolsky40018472011-02-26 01:02:56 +00001817PyObject *
1818PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001819{
1820 size_t size = strlen(u);
1821 if (size > PY_SSIZE_T_MAX) {
1822 PyErr_SetString(PyExc_OverflowError, "input too long");
1823 return NULL;
1824 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001825 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001826}
1827
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001828PyObject *
1829_PyUnicode_FromId(_Py_Identifier *id)
1830{
1831 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001832 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1833 strlen(id->string),
1834 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001835 if (!id->object)
1836 return NULL;
1837 PyUnicode_InternInPlace(&id->object);
1838 assert(!id->next);
1839 id->next = static_strings;
1840 static_strings = id;
1841 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001842 return id->object;
1843}
1844
1845void
1846_PyUnicode_ClearStaticStrings()
1847{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001848 _Py_Identifier *tmp, *s = static_strings;
1849 while (s) {
1850 Py_DECREF(s->object);
1851 s->object = NULL;
1852 tmp = s->next;
1853 s->next = NULL;
1854 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001856 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001857}
1858
Benjamin Peterson0df54292012-03-26 14:50:32 -04001859/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001860
Victor Stinnerd3f08822012-05-29 12:57:52 +02001861PyObject*
1862_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001863{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001864 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001865 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001866 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001867#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001868 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001869#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001870 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001871 }
Victor Stinner785938e2011-12-11 20:09:03 +01001872 unicode = PyUnicode_New(size, 127);
1873 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001874 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001875 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1876 assert(_PyUnicode_CheckConsistency(unicode, 1));
1877 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001878}
1879
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001880static Py_UCS4
1881kind_maxchar_limit(unsigned int kind)
1882{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001883 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001884 case PyUnicode_1BYTE_KIND:
1885 return 0x80;
1886 case PyUnicode_2BYTE_KIND:
1887 return 0x100;
1888 case PyUnicode_4BYTE_KIND:
1889 return 0x10000;
1890 default:
1891 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001892 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001893 }
1894}
1895
Victor Stinnere6abb482012-05-02 01:15:40 +02001896Py_LOCAL_INLINE(Py_UCS4)
1897align_maxchar(Py_UCS4 maxchar)
1898{
1899 if (maxchar <= 127)
1900 return 127;
1901 else if (maxchar <= 255)
1902 return 255;
1903 else if (maxchar <= 65535)
1904 return 65535;
1905 else
1906 return MAX_UNICODE;
1907}
1908
Victor Stinner702c7342011-10-05 13:50:52 +02001909static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001913 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001914
Serhiy Storchaka678db842013-01-26 12:16:36 +02001915 if (size == 0)
1916 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001918 if (size == 1)
1919 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001921 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001922 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 if (!res)
1924 return NULL;
1925 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001928}
1929
Victor Stinnere57b1c02011-09-28 22:20:48 +02001930static PyObject*
1931_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932{
1933 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001934 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001935
Serhiy Storchaka678db842013-01-26 12:16:36 +02001936 if (size == 0)
1937 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001938 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001939 if (size == 1) {
1940 Py_UCS4 ch = u[0];
1941 if (ch < 256)
1942 return get_latin1_char((unsigned char)ch);
1943
1944 res = PyUnicode_New(1, ch);
1945 if (res == NULL)
1946 return NULL;
1947 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1948 assert(_PyUnicode_CheckConsistency(res, 1));
1949 return res;
1950 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001951
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001952 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001953 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!res)
1955 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001956 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001958 else {
1959 _PyUnicode_CONVERT_BYTES(
1960 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1961 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001962 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 return res;
1964}
1965
Victor Stinnere57b1c02011-09-28 22:20:48 +02001966static PyObject*
1967_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968{
1969 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001970 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971
Serhiy Storchaka678db842013-01-26 12:16:36 +02001972 if (size == 0)
1973 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 if (size == 1) {
1976 Py_UCS4 ch = u[0];
1977 if (ch < 256)
1978 return get_latin1_char((unsigned char)ch);
1979
1980 res = PyUnicode_New(1, ch);
1981 if (res == NULL)
1982 return NULL;
1983 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1984 assert(_PyUnicode_CheckConsistency(res, 1));
1985 return res;
1986 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001987
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001988 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 if (!res)
1991 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001992 if (max_char < 256)
1993 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1994 PyUnicode_1BYTE_DATA(res));
1995 else if (max_char < 0x10000)
1996 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1997 PyUnicode_2BYTE_DATA(res));
1998 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002000 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 return res;
2002}
2003
2004PyObject*
2005PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2006{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002007 if (size < 0) {
2008 PyErr_SetString(PyExc_ValueError, "size must be positive");
2009 return NULL;
2010 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002011 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002015 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002017 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002018 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019 PyErr_SetString(PyExc_SystemError, "invalid kind");
2020 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022}
2023
Victor Stinnerece58de2012-04-23 23:36:38 +02002024Py_UCS4
2025_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2026{
2027 enum PyUnicode_Kind kind;
2028 void *startptr, *endptr;
2029
2030 assert(PyUnicode_IS_READY(unicode));
2031 assert(0 <= start);
2032 assert(end <= PyUnicode_GET_LENGTH(unicode));
2033 assert(start <= end);
2034
2035 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2036 return PyUnicode_MAX_CHAR_VALUE(unicode);
2037
2038 if (start == end)
2039 return 127;
2040
Victor Stinner94d558b2012-04-27 22:26:58 +02002041 if (PyUnicode_IS_ASCII(unicode))
2042 return 127;
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002045 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002046 endptr = (char *)startptr + end * kind;
2047 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002048 switch(kind) {
2049 case PyUnicode_1BYTE_KIND:
2050 return ucs1lib_find_max_char(startptr, endptr);
2051 case PyUnicode_2BYTE_KIND:
2052 return ucs2lib_find_max_char(startptr, endptr);
2053 case PyUnicode_4BYTE_KIND:
2054 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002055 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002056 assert(0);
2057 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002058 }
2059}
2060
Victor Stinner25a4b292011-10-06 12:31:55 +02002061/* Ensure that a string uses the most efficient storage, if it is not the
2062 case: create a new string with of the right kind. Write NULL into *p_unicode
2063 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002064static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002065unicode_adjust_maxchar(PyObject **p_unicode)
2066{
2067 PyObject *unicode, *copy;
2068 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002069 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002070 unsigned int kind;
2071
2072 assert(p_unicode != NULL);
2073 unicode = *p_unicode;
2074 assert(PyUnicode_IS_READY(unicode));
2075 if (PyUnicode_IS_ASCII(unicode))
2076 return;
2077
2078 len = PyUnicode_GET_LENGTH(unicode);
2079 kind = PyUnicode_KIND(unicode);
2080 if (kind == PyUnicode_1BYTE_KIND) {
2081 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002082 max_char = ucs1lib_find_max_char(u, u + len);
2083 if (max_char >= 128)
2084 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 }
2086 else if (kind == PyUnicode_2BYTE_KIND) {
2087 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002088 max_char = ucs2lib_find_max_char(u, u + len);
2089 if (max_char >= 256)
2090 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002091 }
2092 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002095 max_char = ucs4lib_find_max_char(u, u + len);
2096 if (max_char >= 0x10000)
2097 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002099 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002100 if (copy != NULL)
2101 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002102 Py_DECREF(unicode);
2103 *p_unicode = copy;
2104}
2105
Victor Stinner034f6cf2011-09-30 02:26:44 +02002106PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002107_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002108{
Victor Stinner87af4f22011-11-21 23:03:47 +01002109 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002110 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002111
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112 if (!PyUnicode_Check(unicode)) {
2113 PyErr_BadInternalCall();
2114 return NULL;
2115 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002116 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002117 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002118
Victor Stinner87af4f22011-11-21 23:03:47 +01002119 length = PyUnicode_GET_LENGTH(unicode);
2120 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002121 if (!copy)
2122 return NULL;
2123 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2124
Victor Stinner87af4f22011-11-21 23:03:47 +01002125 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2126 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002127 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002128 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002129}
2130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131
Victor Stinnerbc603d12011-10-02 01:00:40 +02002132/* Widen Unicode objects to larger buffers. Don't write terminating null
2133 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134
2135void*
2136_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2137{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 Py_ssize_t len;
2139 void *result;
2140 unsigned int skind;
2141
Benjamin Petersonbac79492012-01-14 13:34:47 -05002142 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 return NULL;
2144
2145 len = PyUnicode_GET_LENGTH(s);
2146 skind = PyUnicode_KIND(s);
2147 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002148 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return NULL;
2150 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002151 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152 case PyUnicode_2BYTE_KIND:
2153 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2154 if (!result)
2155 return PyErr_NoMemory();
2156 assert(skind == PyUnicode_1BYTE_KIND);
2157 _PyUnicode_CONVERT_BYTES(
2158 Py_UCS1, Py_UCS2,
2159 PyUnicode_1BYTE_DATA(s),
2160 PyUnicode_1BYTE_DATA(s) + len,
2161 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 case PyUnicode_4BYTE_KIND:
2164 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2165 if (!result)
2166 return PyErr_NoMemory();
2167 if (skind == PyUnicode_2BYTE_KIND) {
2168 _PyUnicode_CONVERT_BYTES(
2169 Py_UCS2, Py_UCS4,
2170 PyUnicode_2BYTE_DATA(s),
2171 PyUnicode_2BYTE_DATA(s) + len,
2172 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 else {
2175 assert(skind == PyUnicode_1BYTE_KIND);
2176 _PyUnicode_CONVERT_BYTES(
2177 Py_UCS1, Py_UCS4,
2178 PyUnicode_1BYTE_DATA(s),
2179 PyUnicode_1BYTE_DATA(s) + len,
2180 result);
2181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 default:
2184 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 }
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188}
2189
2190static Py_UCS4*
2191as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2192 int copy_null)
2193{
2194 int kind;
2195 void *data;
2196 Py_ssize_t len, targetlen;
2197 if (PyUnicode_READY(string) == -1)
2198 return NULL;
2199 kind = PyUnicode_KIND(string);
2200 data = PyUnicode_DATA(string);
2201 len = PyUnicode_GET_LENGTH(string);
2202 targetlen = len;
2203 if (copy_null)
2204 targetlen++;
2205 if (!target) {
2206 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2207 PyErr_NoMemory();
2208 return NULL;
2209 }
2210 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2211 if (!target) {
2212 PyErr_NoMemory();
2213 return NULL;
2214 }
2215 }
2216 else {
2217 if (targetsize < targetlen) {
2218 PyErr_Format(PyExc_SystemError,
2219 "string is longer than the buffer");
2220 if (copy_null && 0 < targetsize)
2221 target[0] = 0;
2222 return NULL;
2223 }
2224 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002225 if (kind == PyUnicode_1BYTE_KIND) {
2226 Py_UCS1 *start = (Py_UCS1 *) data;
2227 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002229 else if (kind == PyUnicode_2BYTE_KIND) {
2230 Py_UCS2 *start = (Py_UCS2 *) data;
2231 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2232 }
2233 else {
2234 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (copy_null)
2238 target[len] = 0;
2239 return target;
2240}
2241
2242Py_UCS4*
2243PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2244 int copy_null)
2245{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002246 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 PyErr_BadInternalCall();
2248 return NULL;
2249 }
2250 return as_ucs4(string, target, targetsize, copy_null);
2251}
2252
2253Py_UCS4*
2254PyUnicode_AsUCS4Copy(PyObject *string)
2255{
2256 return as_ucs4(string, NULL, 0, 1);
2257}
2258
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w`.

   `size` is the number of wide characters to use; the special value -1
   means "measure the buffer with wcslen()".  A NULL buffer is accepted only
   together with size 0, in which case the empty string is returned; any
   other NULL call fails with PyErr_BadInternalCall().  The conversion
   itself is delegated to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        const Py_ssize_t length =
            (size == -1) ? (Py_ssize_t)wcslen(w) : size;
        return PyUnicode_FromUnicode(w, length);
    }

    /* No buffer: only the empty string can be produced. */
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002279
Walter Dörwald346737f2007-05-31 10:44:43 +00002280static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002281makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2282 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 *fmt++ = '%';
2285 if (width) {
2286 if (zeropad)
2287 *fmt++ = '0';
2288 fmt += sprintf(fmt, "%d", width);
2289 }
2290 if (precision)
2291 fmt += sprintf(fmt, ".%d", precision);
2292 if (longflag)
2293 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002294 else if (longlongflag) {
2295 /* longlongflag should only ever be nonzero on machines with
2296 HAVE_LONG_LONG defined */
2297#ifdef HAVE_LONG_LONG
2298 char *f = PY_FORMAT_LONG_LONG;
2299 while (*f)
2300 *fmt++ = *f++;
2301#else
2302 /* we shouldn't ever get here */
2303 assert(0);
2304 *fmt++ = 'l';
2305#endif
2306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 else if (size_tflag) {
2308 char *f = PY_FORMAT_SIZE_T;
2309 while (*f)
2310 *fmt++ = *f++;
2311 }
2312 *fmt++ = c;
2313 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002314}
2315
/* helper for PyUnicode_FromFormatV()

   `f` points at the '%' that starts a conversion specification.  Parse the
   optional "width", ".precision" and length modifier ("l", "ll", "z") and
   return a pointer to the character that should be interpreted as the
   conversion type.

   Each p_* output pointer may be NULL, in which case that value is parsed
   but not reported.  A length modifier is consumed only when it is directly
   followed by 'd', 'u' or 'i'; otherwise the returned pointer is left on
   the modifier character itself. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;                                    /* step over the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width * 10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision * 10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Handle %ld, %lu, %lld and %llu (and the 'i' spellings), as well as
       the size_t flag 'z'. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2380
/* Maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign.
   PyUnicode_FromFormatV() uses this as the minimum field width when it
   sizes the scratch buffer for formatted numbers. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   Used the same way as MAX_LONG_CHARS, for conversions carrying the
   "ll" length modifier. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2388
Walter Dörwaldd2034312007-05-18 16:29:38 +00002389PyObject *
2390PyUnicode_FromFormatV(const char *format, va_list vargs)
2391{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002392 va_list count;
2393 Py_ssize_t callcount = 0;
2394 PyObject **callresults = NULL;
2395 PyObject **callresult = NULL;
2396 Py_ssize_t n = 0;
2397 int width = 0;
2398 int precision = 0;
2399 int zeropad;
2400 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002401 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002402 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002403 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2405 Py_UCS4 argmaxchar;
2406 Py_ssize_t numbersize = 0;
2407 char *numberresults = NULL;
2408 char *numberresult = NULL;
2409 Py_ssize_t i;
2410 int kind;
2411 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002412
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002413 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002414 /* step 1: count the number of %S/%R/%A/%s format specifications
2415 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2416 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002418 * also estimate a upper bound for all the number formats in the string,
2419 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002421 for (f = format; *f; f++) {
2422 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002423 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2425 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2426 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2427 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002430#ifdef HAVE_LONG_LONG
2431 if (longlongflag) {
2432 if (width < MAX_LONG_LONG_CHARS)
2433 width = MAX_LONG_LONG_CHARS;
2434 }
2435 else
2436#endif
2437 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2438 including sign. Decimal takes the most space. This
2439 isn't enough for octal. If a width is specified we
2440 need more (which we allocate later). */
2441 if (width < MAX_LONG_CHARS)
2442 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443
2444 /* account for the size + '\0' to separate numbers
2445 inside of the numberresults buffer */
2446 numbersize += (width + 1);
2447 }
2448 }
2449 else if ((unsigned char)*f > 127) {
2450 PyErr_Format(PyExc_ValueError,
2451 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2452 "string, got a non-ASCII byte: 0x%02x",
2453 (unsigned char)*f);
2454 return NULL;
2455 }
2456 }
2457 /* step 2: allocate memory for the results of
2458 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2459 if (callcount) {
2460 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2461 if (!callresults) {
2462 PyErr_NoMemory();
2463 return NULL;
2464 }
2465 callresult = callresults;
2466 }
2467 /* step 2.5: allocate memory for the results of formating numbers */
2468 if (numbersize) {
2469 numberresults = PyObject_Malloc(numbersize);
2470 if (!numberresults) {
2471 PyErr_NoMemory();
2472 goto fail;
2473 }
2474 numberresult = numberresults;
2475 }
2476
2477 /* step 3: format numbers and figure out how large a buffer we need */
2478 for (f = format; *f; f++) {
2479 if (*f == '%') {
2480 const char* p;
2481 int longflag;
2482 int longlongflag;
2483 int size_tflag;
2484 int numprinted;
2485
2486 p = f;
2487 zeropad = (f[1] == '0');
2488 f = parse_format_flags(f, &width, &precision,
2489 &longflag, &longlongflag, &size_tflag);
2490 switch (*f) {
2491 case 'c':
2492 {
2493 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002494 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n++;
2496 break;
2497 }
2498 case '%':
2499 n++;
2500 break;
2501 case 'i':
2502 case 'd':
2503 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2504 width, precision, *f);
2505 if (longflag)
2506 numprinted = sprintf(numberresult, fmt,
2507 va_arg(count, long));
2508#ifdef HAVE_LONG_LONG
2509 else if (longlongflag)
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, PY_LONG_LONG));
2512#endif
2513 else if (size_tflag)
2514 numprinted = sprintf(numberresult, fmt,
2515 va_arg(count, Py_ssize_t));
2516 else
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, int));
2519 n += numprinted;
2520 /* advance by +1 to skip over the '\0' */
2521 numberresult += (numprinted + 1);
2522 assert(*(numberresult - 1) == '\0');
2523 assert(*(numberresult - 2) != '\0');
2524 assert(numprinted >= 0);
2525 assert(numberresult <= numberresults + numbersize);
2526 break;
2527 case 'u':
2528 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2529 width, precision, 'u');
2530 if (longflag)
2531 numprinted = sprintf(numberresult, fmt,
2532 va_arg(count, unsigned long));
2533#ifdef HAVE_LONG_LONG
2534 else if (longlongflag)
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned PY_LONG_LONG));
2537#endif
2538 else if (size_tflag)
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, size_t));
2541 else
2542 numprinted = sprintf(numberresult, fmt,
2543 va_arg(count, unsigned int));
2544 n += numprinted;
2545 numberresult += (numprinted + 1);
2546 assert(*(numberresult - 1) == '\0');
2547 assert(*(numberresult - 2) != '\0');
2548 assert(numprinted >= 0);
2549 assert(numberresult <= numberresults + numbersize);
2550 break;
2551 case 'x':
2552 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2553 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2554 n += numprinted;
2555 numberresult += (numprinted + 1);
2556 assert(*(numberresult - 1) == '\0');
2557 assert(*(numberresult - 2) != '\0');
2558 assert(numprinted >= 0);
2559 assert(numberresult <= numberresults + numbersize);
2560 break;
2561 case 'p':
2562 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2563 /* %p is ill-defined: ensure leading 0x. */
2564 if (numberresult[1] == 'X')
2565 numberresult[1] = 'x';
2566 else if (numberresult[1] != 'x') {
2567 memmove(numberresult + 2, numberresult,
2568 strlen(numberresult) + 1);
2569 numberresult[0] = '0';
2570 numberresult[1] = 'x';
2571 numprinted += 2;
2572 }
2573 n += numprinted;
2574 numberresult += (numprinted + 1);
2575 assert(*(numberresult - 1) == '\0');
2576 assert(*(numberresult - 2) != '\0');
2577 assert(numprinted >= 0);
2578 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 break;
2580 case 's':
2581 {
2582 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002583 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002584 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002585 if (!str)
2586 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 /* since PyUnicode_DecodeUTF8 returns already flexible
2588 unicode objects, there is no need to call ready on them */
2589 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002590 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002592 /* Remember the str and switch to the next slot */
2593 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 break;
2595 }
2596 case 'U':
2597 {
2598 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002599 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 if (PyUnicode_READY(obj) == -1)
2601 goto fail;
2602 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002603 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'V':
2608 {
2609 PyObject *obj = va_arg(count, PyObject *);
2610 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002611 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002613 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002614 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 if (PyUnicode_READY(obj) == -1)
2616 goto fail;
2617 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002618 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002620 *callresult++ = NULL;
2621 }
2622 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002623 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002624 if (!str_obj)
2625 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002626 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002627 Py_DECREF(str_obj);
2628 goto fail;
2629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002631 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 *callresult++ = str_obj;
2634 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 break;
2636 }
2637 case 'S':
2638 {
2639 PyObject *obj = va_arg(count, PyObject *);
2640 PyObject *str;
2641 assert(obj);
2642 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002643 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002645 if (PyUnicode_READY(str) == -1) {
2646 Py_DECREF(str);
2647 goto fail;
2648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002650 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 /* Remember the str and switch to the next slot */
2653 *callresult++ = str;
2654 break;
2655 }
2656 case 'R':
2657 {
2658 PyObject *obj = va_arg(count, PyObject *);
2659 PyObject *repr;
2660 assert(obj);
2661 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002662 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002663 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002664 if (PyUnicode_READY(repr) == -1) {
2665 Py_DECREF(repr);
2666 goto fail;
2667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002669 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 /* Remember the repr and switch to the next slot */
2672 *callresult++ = repr;
2673 break;
2674 }
2675 case 'A':
2676 {
2677 PyObject *obj = va_arg(count, PyObject *);
2678 PyObject *ascii;
2679 assert(obj);
2680 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002681 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002683 if (PyUnicode_READY(ascii) == -1) {
2684 Py_DECREF(ascii);
2685 goto fail;
2686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002688 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002689 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 /* Remember the repr and switch to the next slot */
2691 *callresult++ = ascii;
2692 break;
2693 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 default:
2695 /* if we stumble upon an unknown
2696 formatting code, copy the rest of
2697 the format string to the output
2698 string. (we cannot just skip the
2699 code, since there's no way to know
2700 what's in the argument list) */
2701 n += strlen(p);
2702 goto expand;
2703 }
2704 } else
2705 n++;
2706 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 we don't have to resize the string.
2711 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002712 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 if (!string)
2714 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 kind = PyUnicode_KIND(string);
2716 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002722 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002723
2724 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2726 /* checking for == because the last argument could be a empty
2727 string, which causes i to point to end, the assert at the end of
2728 the loop */
2729 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002730
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 switch (*f) {
2732 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002733 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 const int ordinal = va_arg(vargs, int);
2735 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002737 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002738 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002743 {
Victor Stinner184252a2012-06-16 02:57:41 +02002744 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 /* unused, since we already have the result */
2746 if (*f == 'p')
2747 (void) va_arg(vargs, void *);
2748 else
2749 (void) va_arg(vargs, int);
2750 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002751 len = strlen(numberresult);
2752 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002754 i += len;
2755 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 assert(*numberresult == '\0');
2757 numberresult++;
2758 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002759 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002760 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 case 's':
2762 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002763 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002765 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 size = PyUnicode_GET_LENGTH(*callresult);
2767 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002768 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002770 /* We're done with the unicode()/repr() => forget it */
2771 Py_DECREF(*callresult);
2772 /* switch to next unicode()/repr() result */
2773 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 break;
2775 }
2776 case 'U':
2777 {
2778 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 Py_ssize_t size;
2780 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2781 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002782 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 break;
2785 }
2786 case 'V':
2787 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 size = PyUnicode_GET_LENGTH(obj);
2793 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002794 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 size = PyUnicode_GET_LENGTH(*callresult);
2798 assert(PyUnicode_KIND(*callresult) <=
2799 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002800 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002802 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002804 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002805 break;
2806 }
2807 case 'S':
2808 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002809 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002811 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 /* unused, since we already have the result */
2813 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002815 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002816 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 /* We're done with the unicode()/repr() => forget it */
2818 Py_DECREF(*callresult);
2819 /* switch to next unicode()/repr() result */
2820 ++callresult;
2821 break;
2822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002823 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002825 break;
2826 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002827 {
2828 Py_ssize_t len = strlen(p);
2829 unicode_write_cstr(string, i, p, len);
2830 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002831 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002832 goto end;
2833 }
Victor Stinner184252a2012-06-16 02:57:41 +02002834 }
Victor Stinner1205f272010-09-11 00:54:47 +00002835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836 else {
2837 assert(i < PyUnicode_GET_LENGTH(string));
2838 PyUnicode_WRITE(kind, data, i++, *f);
2839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002842
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 if (callresults)
2845 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002846 if (numberresults)
2847 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002848 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002850 if (callresults) {
2851 PyObject **callresult2 = callresults;
2852 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002853 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 ++callresult2;
2855 }
2856 PyObject_Free(callresults);
2857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 if (numberresults)
2859 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002860 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861}
2862
Walter Dörwaldd2034312007-05-18 16:29:38 +00002863PyObject *
2864PyUnicode_FromFormat(const char *format, ...)
2865{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002866 PyObject* ret;
2867 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002868
2869#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002872 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002873#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 ret = PyUnicode_FromFormatV(format, vargs);
2875 va_end(vargs);
2876 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877}
2878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002879#ifdef HAVE_WCHAR_H
2880
Victor Stinner5593d8a2010-10-02 11:11:27 +00002881/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2882 convert a Unicode object to a wide character string.
2883
Victor Stinnerd88d9832011-09-06 02:00:05 +02002884 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885 character) required to convert the unicode object. Ignore size argument.
2886
Victor Stinnerd88d9832011-09-06 02:00:05 +02002887 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002888 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002889 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002890static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002891unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002892 wchar_t *w,
2893 Py_ssize_t size)
2894{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002895 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002896 const wchar_t *wstr;
2897
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002898 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 if (wstr == NULL)
2900 return -1;
2901
Victor Stinner5593d8a2010-10-02 11:11:27 +00002902 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002903 if (size > res)
2904 size = res + 1;
2905 else
2906 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002908 return res;
2909 }
2910 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002912}
2913
2914Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002915PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002916 wchar_t *w,
2917 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918{
2919 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 PyErr_BadInternalCall();
2921 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002923 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924}
2925
Victor Stinner137c34c2010-09-29 10:25:54 +00002926wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002927PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002928 Py_ssize_t *size)
2929{
2930 wchar_t* buffer;
2931 Py_ssize_t buflen;
2932
2933 if (unicode == NULL) {
2934 PyErr_BadInternalCall();
2935 return NULL;
2936 }
2937
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002938 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002939 if (buflen == -1)
2940 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002941 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002942 PyErr_NoMemory();
2943 return NULL;
2944 }
2945
Victor Stinner137c34c2010-09-29 10:25:54 +00002946 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2947 if (buffer == NULL) {
2948 PyErr_NoMemory();
2949 return NULL;
2950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002951 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002952 if (buflen == -1) {
2953 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002955 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956 if (size != NULL)
2957 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002958 return buffer;
2959}
2960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962
Alexander Belopolsky40018472011-02-26 01:02:56 +00002963PyObject *
2964PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002967 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 PyErr_SetString(PyExc_ValueError,
2969 "chr() arg not in range(0x110000)");
2970 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002971 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002973 if (ordinal < 256)
2974 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 v = PyUnicode_New(1, ordinal);
2977 if (v == NULL)
2978 return NULL;
2979 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002980 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002982}
2983
Alexander Belopolsky40018472011-02-26 01:02:56 +00002984PyObject *
2985PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002987 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002989 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002990 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002991 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 Py_INCREF(obj);
2993 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002994 }
2995 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 /* For a Unicode subtype that's not a Unicode object,
2997 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002998 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002999 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003000 PyErr_Format(PyExc_TypeError,
3001 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003002 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003003 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003004}
3005
Alexander Belopolsky40018472011-02-26 01:02:56 +00003006PyObject *
3007PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003008 const char *encoding,
3009 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003010{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003011 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003012 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003013
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyErr_BadInternalCall();
3016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003018
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003019 /* Decoding bytes objects is the most common case and should be fast */
3020 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003021 if (PyBytes_GET_SIZE(obj) == 0)
3022 _Py_RETURN_UNICODE_EMPTY();
3023 v = PyUnicode_Decode(
3024 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3025 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003026 return v;
3027 }
3028
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003029 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 PyErr_SetString(PyExc_TypeError,
3031 "decoding str is not supported");
3032 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003033 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003034
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003035 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3036 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3037 PyErr_Format(PyExc_TypeError,
3038 "coercing to str: need bytes, bytearray "
3039 "or buffer-like object, %.80s found",
3040 Py_TYPE(obj)->tp_name);
3041 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003042 }
Tim Petersced69f82003-09-16 20:30:58 +00003043
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003044 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003045 PyBuffer_Release(&buffer);
3046 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003048
Serhiy Storchaka05997252013-01-26 12:14:02 +02003049 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003050 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003051 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052}
3053
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The normalized name, including its terminating null byte, is written to
   lower, which must provide room for lower_len bytes.

   Return 1 on success, 0 on error (the normalized name does not fit into
   lower_len bytes; nothing useful is stored in lower in that case). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1: never copy past the caller's buffer */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    /* With an empty buffer lower_len - 1 would wrap around and the final
       '\0' store would land outside of it. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091PyObject *
3092PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003093 Py_ssize_t size,
3094 const char *encoding,
3095 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003096{
3097 PyObject *buffer = NULL, *unicode;
3098 Py_buffer info;
3099 char lower[11]; /* Enough for any encoding shortcut */
3100
Fred Drakee4315f52000-05-09 19:53:39 +00003101 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003102 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003103 if ((strcmp(lower, "utf-8") == 0) ||
3104 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003105 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003106 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003107 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003108 (strcmp(lower, "iso-8859-1") == 0))
3109 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003110#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003111 else if (strcmp(lower, "mbcs") == 0)
3112 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003113#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003114 else if (strcmp(lower, "ascii") == 0)
3115 return PyUnicode_DecodeASCII(s, size, errors);
3116 else if (strcmp(lower, "utf-16") == 0)
3117 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3118 else if (strcmp(lower, "utf-32") == 0)
3119 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121
3122 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003123 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003124 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003125 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003126 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 if (buffer == NULL)
3128 goto onError;
3129 unicode = PyCodec_Decode(buffer, encoding, errors);
3130 if (unicode == NULL)
3131 goto onError;
3132 if (!PyUnicode_Check(unicode)) {
3133 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003134 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003135 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 Py_DECREF(unicode);
3137 goto onError;
3138 }
3139 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003140 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003141
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 Py_XDECREF(buffer);
3144 return NULL;
3145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 const char *encoding,
3150 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003151{
3152 PyObject *v;
3153
3154 if (!PyUnicode_Check(unicode)) {
3155 PyErr_BadArgument();
3156 goto onError;
3157 }
3158
3159 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003161
3162 /* Decode via the codec registry */
3163 v = PyCodec_Decode(unicode, encoding, errors);
3164 if (v == NULL)
3165 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003166 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003167
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003169 return NULL;
3170}
3171
Alexander Belopolsky40018472011-02-26 01:02:56 +00003172PyObject *
3173PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003174 const char *encoding,
3175 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003176{
3177 PyObject *v;
3178
3179 if (!PyUnicode_Check(unicode)) {
3180 PyErr_BadArgument();
3181 goto onError;
3182 }
3183
3184 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186
3187 /* Decode via the codec registry */
3188 v = PyCodec_Decode(unicode, encoding, errors);
3189 if (v == NULL)
3190 goto onError;
3191 if (!PyUnicode_Check(v)) {
3192 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003193 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003194 Py_TYPE(v)->tp_name);
3195 Py_DECREF(v);
3196 goto onError;
3197 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003198 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003201 return NULL;
3202}
3203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204PyObject *
3205PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003206 Py_ssize_t size,
3207 const char *encoding,
3208 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209{
3210 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 unicode = PyUnicode_FromUnicode(s, size);
3213 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3216 Py_DECREF(unicode);
3217 return v;
3218}
3219
Alexander Belopolsky40018472011-02-26 01:02:56 +00003220PyObject *
3221PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003222 const char *encoding,
3223 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003224{
3225 PyObject *v;
3226
3227 if (!PyUnicode_Check(unicode)) {
3228 PyErr_BadArgument();
3229 goto onError;
3230 }
3231
3232 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003234
3235 /* Encode via the codec registry */
3236 v = PyCodec_Encode(unicode, encoding, errors);
3237 if (v == NULL)
3238 goto onError;
3239 return v;
3240
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242 return NULL;
3243}
3244
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).

   Return the index of the offending character in wstr, or 0 when every
   character converts successfully. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *cursor = wstr;
    const wchar_t *unit_start;
    char scratch[MB_LEN_MAX];
    /* Null-terminated buffer holding the single unit under test */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = { 0, 0, 0 };
#else
    wchar_t unit[2] = { 0, 0 };
#endif

    while (*cursor != L'\0') {
        unit_start = cursor;
#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3291
Victor Stinner1b579672011-12-17 05:47:23 +01003292static int
3293locale_error_handler(const char *errors, int *surrogateescape)
3294{
3295 if (errors == NULL) {
3296 *surrogateescape = 0;
3297 return 0;
3298 }
3299
3300 if (strcmp(errors, "strict") == 0) {
3301 *surrogateescape = 0;
3302 return 0;
3303 }
3304 if (strcmp(errors, "surrogateescape") == 0) {
3305 *surrogateescape = 1;
3306 return 0;
3307 }
3308 PyErr_Format(PyExc_ValueError,
3309 "only 'strict' and 'surrogateescape' error handlers "
3310 "are supported, not '%s'",
3311 errors);
3312 return -1;
3313}
3314
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003315PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003316PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003317{
3318 Py_ssize_t wlen, wlen2;
3319 wchar_t *wstr;
3320 PyObject *bytes = NULL;
3321 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003322 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003323 PyObject *exc;
3324 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003325 int surrogateescape;
3326
3327 if (locale_error_handler(errors, &surrogateescape) < 0)
3328 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003329
3330 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3331 if (wstr == NULL)
3332 return NULL;
3333
3334 wlen2 = wcslen(wstr);
3335 if (wlen2 != wlen) {
3336 PyMem_Free(wstr);
3337 PyErr_SetString(PyExc_TypeError, "embedded null character");
3338 return NULL;
3339 }
3340
3341 if (surrogateescape) {
3342 /* locale encoding with surrogateescape */
3343 char *str;
3344
3345 str = _Py_wchar2char(wstr, &error_pos);
3346 if (str == NULL) {
3347 if (error_pos == (size_t)-1) {
3348 PyErr_NoMemory();
3349 PyMem_Free(wstr);
3350 return NULL;
3351 }
3352 else {
3353 goto encode_error;
3354 }
3355 }
3356 PyMem_Free(wstr);
3357
3358 bytes = PyBytes_FromString(str);
3359 PyMem_Free(str);
3360 }
3361 else {
3362 size_t len, len2;
3363
3364 len = wcstombs(NULL, wstr, 0);
3365 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003366 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003367 goto encode_error;
3368 }
3369
3370 bytes = PyBytes_FromStringAndSize(NULL, len);
3371 if (bytes == NULL) {
3372 PyMem_Free(wstr);
3373 return NULL;
3374 }
3375
3376 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3377 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003378 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003379 goto encode_error;
3380 }
3381 PyMem_Free(wstr);
3382 }
3383 return bytes;
3384
3385encode_error:
3386 errmsg = strerror(errno);
3387 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003388
3389 if (error_pos == (size_t)-1)
3390 error_pos = wcstombs_errorpos(wstr);
3391
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003392 PyMem_Free(wstr);
3393 Py_XDECREF(bytes);
3394
Victor Stinner2f197072011-12-17 07:08:30 +01003395 if (errmsg != NULL) {
3396 size_t errlen;
3397 wstr = _Py_char2wchar(errmsg, &errlen);
3398 if (wstr != NULL) {
3399 reason = PyUnicode_FromWideChar(wstr, errlen);
3400 PyMem_Free(wstr);
3401 } else
3402 errmsg = NULL;
3403 }
3404 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003405 reason = PyUnicode_FromString(
3406 "wcstombs() encountered an unencodable "
3407 "wide character");
3408 if (reason == NULL)
3409 return NULL;
3410
3411 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3412 "locale", unicode,
3413 (Py_ssize_t)error_pos,
3414 (Py_ssize_t)(error_pos+1),
3415 reason);
3416 Py_DECREF(reason);
3417 if (exc != NULL) {
3418 PyCodec_StrictErrors(exc);
3419 Py_XDECREF(exc);
3420 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421 return NULL;
3422}
3423
Victor Stinnerad158722010-10-27 00:25:46 +00003424PyObject *
3425PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003426{
Victor Stinner99b95382011-07-04 14:23:54 +02003427#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003428 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003429#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003430 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003431#else
Victor Stinner793b5312011-04-27 00:24:21 +02003432 PyInterpreterState *interp = PyThreadState_GET()->interp;
3433 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3434 cannot use it to encode and decode filenames before it is loaded. Load
3435 the Python codec requires to encode at least its own filename. Use the C
3436 version of the locale codec until the codec registry is initialized and
3437 the Python codec is loaded.
3438
3439 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3440 cannot only rely on it: check also interp->fscodec_initialized for
3441 subinterpreters. */
3442 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003443 return PyUnicode_AsEncodedString(unicode,
3444 Py_FileSystemDefaultEncoding,
3445 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003446 }
3447 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003448 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003449 }
Victor Stinnerad158722010-10-27 00:25:46 +00003450#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003451}
3452
Alexander Belopolsky40018472011-02-26 01:02:56 +00003453PyObject *
3454PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003455 const char *encoding,
3456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457{
3458 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003459 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003460
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003463 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 }
Fred Drakee4315f52000-05-09 19:53:39 +00003465
Fred Drakee4315f52000-05-09 19:53:39 +00003466 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003467 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003468 if ((strcmp(lower, "utf-8") == 0) ||
3469 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003470 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003471 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003473 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003475 }
Victor Stinner37296e82010-06-10 13:36:23 +00003476 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003477 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003478 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003480#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003481 else if (strcmp(lower, "mbcs") == 0)
3482 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003483#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003484 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487
3488 /* Encode via the codec registry */
3489 v = PyCodec_Encode(unicode, encoding, errors);
3490 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003491 return NULL;
3492
3493 /* The normal path */
3494 if (PyBytes_Check(v))
3495 return v;
3496
3497 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003498 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003499 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003500 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003501
3502 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3503 "encoder %s returned bytearray instead of bytes",
3504 encoding);
3505 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003506 Py_DECREF(v);
3507 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003508 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003509
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003510 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3511 Py_DECREF(v);
3512 return b;
3513 }
3514
3515 PyErr_Format(PyExc_TypeError,
3516 "encoder did not return a bytes object (type=%.400s)",
3517 Py_TYPE(v)->tp_name);
3518 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003519 return NULL;
3520}
3521
Alexander Belopolsky40018472011-02-26 01:02:56 +00003522PyObject *
3523PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003524 const char *encoding,
3525 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003526{
3527 PyObject *v;
3528
3529 if (!PyUnicode_Check(unicode)) {
3530 PyErr_BadArgument();
3531 goto onError;
3532 }
3533
3534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003536
3537 /* Encode via the codec registry */
3538 v = PyCodec_Encode(unicode, encoding, errors);
3539 if (v == NULL)
3540 goto onError;
3541 if (!PyUnicode_Check(v)) {
3542 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003543 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544 Py_TYPE(v)->tp_name);
3545 Py_DECREF(v);
3546 goto onError;
3547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003549
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return NULL;
3552}
3553
/* Return the byte offset of the first sequence in `str` (`len` bytes) that
   mbrtowc() reports as invalid or incomplete.  Returns 0 when no such
   sequence is found, or when mbrtowc() is not available on this platform. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *start = str;
    mbstate_t mbs;
    size_t converted;
    wchar_t ch;

    memset(&mbs, 0, sizeof mbs);
    while (len) {
        converted = mbrtowc(&ch, str, len, &mbs);
        if (converted == 0)
            /* Reached end of string */
            break;
        if (converted == (size_t)-1 || converted == (size_t)-2) {
            /* Conversion error or incomplete character */
            return str - start;
        }
        str += converted;
        len -= converted;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3584
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003585PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003586PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003587 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003588{
3589 wchar_t smallbuf[256];
3590 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3591 wchar_t *wstr;
3592 size_t wlen, wlen2;
3593 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003594 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003595 size_t error_pos;
3596 char *errmsg;
3597 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003598
3599 if (locale_error_handler(errors, &surrogateescape) < 0)
3600 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003601
3602 if (str[len] != '\0' || len != strlen(str)) {
3603 PyErr_SetString(PyExc_TypeError, "embedded null character");
3604 return NULL;
3605 }
3606
3607 if (surrogateescape)
3608 {
3609 wstr = _Py_char2wchar(str, &wlen);
3610 if (wstr == NULL) {
3611 if (wlen == (size_t)-1)
3612 PyErr_NoMemory();
3613 else
3614 PyErr_SetFromErrno(PyExc_OSError);
3615 return NULL;
3616 }
3617
3618 unicode = PyUnicode_FromWideChar(wstr, wlen);
3619 PyMem_Free(wstr);
3620 }
3621 else {
3622#ifndef HAVE_BROKEN_MBSTOWCS
3623 wlen = mbstowcs(NULL, str, 0);
3624#else
3625 wlen = len;
3626#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003627 if (wlen == (size_t)-1)
3628 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003629 if (wlen+1 <= smallbuf_len) {
3630 wstr = smallbuf;
3631 }
3632 else {
3633 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3634 return PyErr_NoMemory();
3635
3636 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3637 if (!wstr)
3638 return PyErr_NoMemory();
3639 }
3640
3641 /* This shouldn't fail now */
3642 wlen2 = mbstowcs(wstr, str, wlen+1);
3643 if (wlen2 == (size_t)-1) {
3644 if (wstr != smallbuf)
3645 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003646 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003647 }
3648#ifdef HAVE_BROKEN_MBSTOWCS
3649 assert(wlen2 == wlen);
3650#endif
3651 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3652 if (wstr != smallbuf)
3653 PyMem_Free(wstr);
3654 }
3655 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003656
3657decode_error:
3658 errmsg = strerror(errno);
3659 assert(errmsg != NULL);
3660
3661 error_pos = mbstowcs_errorpos(str, len);
3662 if (errmsg != NULL) {
3663 size_t errlen;
3664 wstr = _Py_char2wchar(errmsg, &errlen);
3665 if (wstr != NULL) {
3666 reason = PyUnicode_FromWideChar(wstr, errlen);
3667 PyMem_Free(wstr);
3668 } else
3669 errmsg = NULL;
3670 }
3671 if (errmsg == NULL)
3672 reason = PyUnicode_FromString(
3673 "mbstowcs() encountered an invalid multibyte sequence");
3674 if (reason == NULL)
3675 return NULL;
3676
3677 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3678 "locale", str, len,
3679 (Py_ssize_t)error_pos,
3680 (Py_ssize_t)(error_pos+1),
3681 reason);
3682 Py_DECREF(reason);
3683 if (exc != NULL) {
3684 PyCodec_StrictErrors(exc);
3685 Py_XDECREF(exc);
3686 }
3687 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003688}
3689
3690PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003691PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003692{
3693 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003694 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695}
3696
3697
3698PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003699PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003700 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003701 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3702}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003703
Christian Heimes5894ba72007-11-04 11:43:14 +00003704PyObject*
3705PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3706{
Victor Stinner99b95382011-07-04 14:23:54 +02003707#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003708 return PyUnicode_DecodeMBCS(s, size, NULL);
3709#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003710 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003711#else
Victor Stinner793b5312011-04-27 00:24:21 +02003712 PyInterpreterState *interp = PyThreadState_GET()->interp;
3713 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3714 cannot use it to encode and decode filenames before it is loaded. Load
3715 the Python codec requires to encode at least its own filename. Use the C
3716 version of the locale codec until the codec registry is initialized and
3717 the Python codec is loaded.
3718
3719 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3720 cannot only rely on it: check also interp->fscodec_initialized for
3721 subinterpreters. */
3722 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003723 return PyUnicode_Decode(s, size,
3724 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003725 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003726 }
3727 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003728 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003729 }
Victor Stinnerad158722010-10-27 00:25:46 +00003730#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003731}
3732
Martin v. Löwis011e8422009-05-05 04:43:17 +00003733
3734int
Antoine Pitrou13348842012-01-29 18:36:34 +01003735_PyUnicode_HasNULChars(PyObject* s)
3736{
3737 static PyObject *nul = NULL;
3738
3739 if (nul == NULL)
3740 nul = PyUnicode_FromStringAndSize("\0", 1);
3741 if (nul == NULL)
3742 return -1;
3743 return PyUnicode_Contains(s, nul);
3744}
3745
3746
3747int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003748PyUnicode_FSConverter(PyObject* arg, void* addr)
3749{
3750 PyObject *output = NULL;
3751 Py_ssize_t size;
3752 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003753 if (arg == NULL) {
3754 Py_DECREF(*(PyObject**)addr);
3755 return 1;
3756 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003757 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758 output = arg;
3759 Py_INCREF(output);
3760 }
3761 else {
3762 arg = PyUnicode_FromObject(arg);
3763 if (!arg)
3764 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003765 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003766 Py_DECREF(arg);
3767 if (!output)
3768 return 0;
3769 if (!PyBytes_Check(output)) {
3770 Py_DECREF(output);
3771 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3772 return 0;
3773 }
3774 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003775 size = PyBytes_GET_SIZE(output);
3776 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003777 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003778 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003779 Py_DECREF(output);
3780 return 0;
3781 }
3782 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003783 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003784}
3785
3786
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003787int
3788PyUnicode_FSDecoder(PyObject* arg, void* addr)
3789{
3790 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003791 if (arg == NULL) {
3792 Py_DECREF(*(PyObject**)addr);
3793 return 1;
3794 }
3795 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003796 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003798 output = arg;
3799 Py_INCREF(output);
3800 }
3801 else {
3802 arg = PyBytes_FromObject(arg);
3803 if (!arg)
3804 return 0;
3805 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3806 PyBytes_GET_SIZE(arg));
3807 Py_DECREF(arg);
3808 if (!output)
3809 return 0;
3810 if (!PyUnicode_Check(output)) {
3811 Py_DECREF(output);
3812 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3813 return 0;
3814 }
3815 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003816 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003817 Py_DECREF(output);
3818 return 0;
3819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003821 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003822 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3823 Py_DECREF(output);
3824 return 0;
3825 }
3826 *(PyObject**)addr = output;
3827 return Py_CLEANUP_SUPPORTED;
3828}
3829
3830
Martin v. Löwis5b222132007-06-10 09:51:05 +00003831char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003833{
Christian Heimesf3863112007-11-22 07:46:41 +00003834 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 return NULL;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003841 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003843 if (PyUnicode_UTF8(unicode) == NULL) {
3844 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3846 if (bytes == NULL)
3847 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3849 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 Py_DECREF(bytes);
3851 return NULL;
3852 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3854 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3855 PyBytes_AS_STRING(bytes),
3856 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 Py_DECREF(bytes);
3858 }
3859
3860 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003861 *psize = PyUnicode_UTF8_LENGTH(unicode);
3862 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003863}
3864
3865char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3869}
3870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871Py_UNICODE *
3872PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 const unsigned char *one_byte;
3875#if SIZEOF_WCHAR_T == 4
3876 const Py_UCS2 *two_bytes;
3877#else
3878 const Py_UCS4 *four_bytes;
3879 const Py_UCS4 *ucs4_end;
3880 Py_ssize_t num_surrogates;
3881#endif
3882 wchar_t *w;
3883 wchar_t *wchar_end;
3884
3885 if (!PyUnicode_Check(unicode)) {
3886 PyErr_BadArgument();
3887 return NULL;
3888 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 assert(_PyUnicode_KIND(unicode) != 0);
3892 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3897 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898 num_surrogates = 0;
3899
3900 for (; four_bytes < ucs4_end; ++four_bytes) {
3901 if (*four_bytes > 0xFFFF)
3902 ++num_surrogates;
3903 }
3904
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003905 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3906 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3907 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 PyErr_NoMemory();
3909 return NULL;
3910 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003911 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003913 w = _PyUnicode_WSTR(unicode);
3914 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3915 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3917 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003918 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003920 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3921 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 }
3923 else
3924 *w = *four_bytes;
3925
3926 if (w > wchar_end) {
3927 assert(0 && "Miscalculated string end");
3928 }
3929 }
3930 *w = 0;
3931#else
3932 /* sizeof(wchar_t) == 4 */
3933 Py_FatalError("Impossible unicode object state, wstr and str "
3934 "should share memory already.");
3935 return NULL;
3936#endif
3937 }
3938 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3940 (_PyUnicode_LENGTH(unicode) + 1));
3941 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 PyErr_NoMemory();
3943 return NULL;
3944 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3946 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3947 w = _PyUnicode_WSTR(unicode);
3948 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3951 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 for (; w < wchar_end; ++one_byte, ++w)
3953 *w = *one_byte;
3954 /* null-terminate the wstr */
3955 *w = 0;
3956 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003957 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 for (; w < wchar_end; ++two_bytes, ++w)
3961 *w = *two_bytes;
3962 /* null-terminate the wstr */
3963 *w = 0;
3964#else
3965 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003966 PyObject_FREE(_PyUnicode_WSTR(unicode));
3967 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 Py_FatalError("Impossible unicode object state, wstr "
3969 "and str should share memory already.");
3970 return NULL;
3971#endif
3972 }
3973 else {
3974 assert(0 && "This should never happen.");
3975 }
3976 }
3977 }
3978 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003979 *size = PyUnicode_WSTR_LENGTH(unicode);
3980 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003981}
3982
Alexander Belopolsky40018472011-02-26 01:02:56 +00003983Py_UNICODE *
3984PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987}
3988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989
Alexander Belopolsky40018472011-02-26 01:02:56 +00003990Py_ssize_t
3991PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992{
3993 if (!PyUnicode_Check(unicode)) {
3994 PyErr_BadArgument();
3995 goto onError;
3996 }
3997 return PyUnicode_GET_SIZE(unicode);
3998
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 return -1;
4001}
4002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003Py_ssize_t
4004PyUnicode_GetLength(PyObject *unicode)
4005{
Victor Stinner07621332012-06-16 04:53:46 +02004006 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 PyErr_BadArgument();
4008 return -1;
4009 }
Victor Stinner07621332012-06-16 04:53:46 +02004010 if (PyUnicode_READY(unicode) == -1)
4011 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_GET_LENGTH(unicode);
4013}
4014
4015Py_UCS4
4016PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4017{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004018 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4019 PyErr_BadArgument();
4020 return (Py_UCS4)-1;
4021 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004022 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004023 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 return (Py_UCS4)-1;
4025 }
4026 return PyUnicode_READ_CHAR(unicode, index);
4027}
4028
4029int
4030PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4031{
4032 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004033 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 return -1;
4035 }
Victor Stinner488fa492011-12-12 00:01:39 +01004036 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004037 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004038 PyErr_SetString(PyExc_IndexError, "string index out of range");
4039 return -1;
4040 }
Victor Stinner488fa492011-12-12 00:01:39 +01004041 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004042 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004043 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4044 PyErr_SetString(PyExc_ValueError, "character out of range");
4045 return -1;
4046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4048 index, ch);
4049 return 0;
4050}
4051
/* Return the name of the default encoding, which is fixed to "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";

    return default_encoding;
}
4057
Victor Stinner554f3f02010-06-16 23:33:54 +00004058/* create or adjust a UnicodeDecodeError */
4059static void
4060make_decode_exception(PyObject **exceptionObject,
4061 const char *encoding,
4062 const char *input, Py_ssize_t length,
4063 Py_ssize_t startpos, Py_ssize_t endpos,
4064 const char *reason)
4065{
4066 if (*exceptionObject == NULL) {
4067 *exceptionObject = PyUnicodeDecodeError_Create(
4068 encoding, input, length, startpos, endpos, reason);
4069 }
4070 else {
4071 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4072 goto onError;
4073 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4074 goto onError;
4075 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4076 goto onError;
4077 }
4078 return;
4079
4080onError:
4081 Py_DECREF(*exceptionObject);
4082 *exceptionObject = NULL;
4083}
4084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085/* error handling callback helper:
4086 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004087 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 and adjust various state variables.
4089 return 0 on success, -1 on error
4090*/
4091
Alexander Belopolsky40018472011-02-26 01:02:56 +00004092static int
4093unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004094 const char *encoding, const char *reason,
4095 const char **input, const char **inend, Py_ssize_t *startinpos,
4096 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004097 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004099 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100
4101 PyObject *restuple = NULL;
4102 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004103 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004104 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004105 Py_ssize_t requiredsize;
4106 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004107 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 int res = -1;
4109
Victor Stinner596a6c42011-11-09 00:02:18 +01004110 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4111 outsize = PyUnicode_GET_LENGTH(*output);
4112 else
4113 outsize = _PyUnicode_WSTR_LENGTH(*output);
4114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 *errorHandler = PyCodec_LookupError(errors);
4117 if (*errorHandler == NULL)
4118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 }
4120
Victor Stinner554f3f02010-06-16 23:33:54 +00004121 make_decode_exception(exceptionObject,
4122 encoding,
4123 *input, *inend - *input,
4124 *startinpos, *endinpos,
4125 reason);
4126 if (*exceptionObject == NULL)
4127 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128
4129 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4130 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004133 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 }
4136 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004138 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004139 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140
4141 /* Copy back the bytes variables, which might have been modified by the
4142 callback */
4143 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4144 if (!inputobj)
4145 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004146 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004148 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004149 *input = PyBytes_AS_STRING(inputobj);
4150 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004151 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004152 /* we can DECREF safely, as the exception has another reference,
4153 so the object won't go away. */
4154 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004158 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4160 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004161 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162
Victor Stinner596a6c42011-11-09 00:02:18 +01004163 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4164 /* need more space? (at least enough for what we
4165 have+the replacement+the rest of the string (starting
4166 at the new input position), so we won't have to check space
4167 when there are no errors in the rest of the string) */
4168 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4169 requiredsize = *outpos + replen + insize-newpos;
4170 if (requiredsize > outsize) {
4171 if (requiredsize<2*outsize)
4172 requiredsize = 2*outsize;
4173 if (unicode_resize(output, requiredsize) < 0)
4174 goto onError;
4175 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004176 if (unicode_widen(output, *outpos,
4177 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004179 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004180 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004182 else {
4183 wchar_t *repwstr;
4184 Py_ssize_t repwlen;
4185 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4186 if (repwstr == NULL)
4187 goto onError;
4188 /* need more space? (at least enough for what we
4189 have+the replacement+the rest of the string (starting
4190 at the new input position), so we won't have to check space
4191 when there are no errors in the rest of the string) */
4192 requiredsize = *outpos + repwlen + insize-newpos;
4193 if (requiredsize > outsize) {
4194 if (requiredsize < 2*outsize)
4195 requiredsize = 2*outsize;
4196 if (unicode_resize(output, requiredsize) < 0)
4197 goto onError;
4198 }
4199 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4200 *outpos += repwlen;
4201 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004203 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 /* we made it! */
4206 res = 0;
4207
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 Py_XDECREF(restuple);
4210 return res;
4211}
4212
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004213/* --- UTF-7 Codec -------------------------------------------------------- */
4214
Antoine Pitrou244651a2009-05-04 18:56:13 +00004215/* See RFC2152 for details. We encode conservatively and decode liberally. */
4216
4217/* Three simple macros defining base-64. */
4218
/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')

/* Map a base-64 character to its 6-bit value.  The argument is assumed to
   satisfy IS_BASE64(); anything unrecognized falls through to 63. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Map the low 6 bits of n to the corresponding base-64 character. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 run. */

#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
4247
4248/* The UTF-7 encoder treats ASCII characters differently according to
4249 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4250 * the above). See RFC2152. This array identifies these different
4251 * sets:
4252 * 0 : "Set D"
4253 * alphanumeric and '(),-./:?
4254 * 1 : "Set O"
4255 * !"#$%&*;<=>@[]^_`{|}
4256 * 2 : "whitespace"
4257 * ht nl cr sp
4258 * 3 : special (must be base64 encoded)
4259 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4260 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004261
/* Encoder category of each of the 128 ASCII code points:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base-64
   encoded).  The table is only indexed, never written, so it is const. */
static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4281
Antoine Pitrou244651a2009-05-04 18:56:13 +00004282/* ENCODE_DIRECT: this character should be encoded as itself. The
4283 * answer depends on whether we are encoding set O as itself, and also
4284 * on whether we are encoding whitespace as itself. RFC2152 makes it
4285 * clear that the answers to these questions vary between
4286 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00004287
/* Non-zero when code point c may be written to the output unchanged:
   c must lie in 1..127 and be in Set D, or in the whitespace set with
   directWS true, or in Set O with directO true.  The flag arguments are
   parenthesized so that compound expressions (e.g. "a || b") keep their
   meaning next to the "&&"; c is evaluated several times. */
#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293
Alexander Belopolsky40018472011-02-26 01:02:56 +00004294PyObject *
4295PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004296 Py_ssize_t size,
4297 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004299 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4300}
4301
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302/* The decoder. The only state we preserve is our read position,
4303 * i.e. how many characters we have consumed. So if we end in the
4304 * middle of a shift sequence we have to back off the read position
4305 * and the output to the beginning of the sequence, otherwise we lose
4306 * all the shift state (seen bits, number of bits seen, high
4307 * surrogate). */
4308
Alexander Belopolsky40018472011-02-26 01:02:56 +00004309PyObject *
4310PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004311 Py_ssize_t size,
4312 const char *errors,
4313 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004314{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316 Py_ssize_t startinpos;
4317 Py_ssize_t endinpos;
4318 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004320 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 const char *errmsg = "";
4322 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004323 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 unsigned int base64bits = 0;
4325 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004326 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 PyObject *errorHandler = NULL;
4328 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004329
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004330 /* Start off assuming it's all ASCII. Widen later as necessary. */
4331 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 if (!unicode)
4333 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004334 if (size == 0) {
4335 if (consumed)
4336 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004337 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004338 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004340 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004341 e = s + size;
4342
4343 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004344 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004346 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 if (inShift) { /* in a base-64 section */
4349 if (IS_BASE64(ch)) { /* consume a base-64 character */
4350 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4351 base64bits += 6;
4352 s++;
4353 if (base64bits >= 16) {
4354 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004355 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 base64bits -= 16;
4357 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4358 if (surrogate) {
4359 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004360 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4361 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004362 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4363 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004365 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 }
4367 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004368 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4369 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 }
4372 }
Victor Stinner551ac952011-11-29 22:58:13 +01004373 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 /* first surrogate */
4375 surrogate = outCh;
4376 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004378 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4379 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 }
4381 }
4382 }
4383 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 inShift = 0;
4385 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004387 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4388 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004389 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 if (base64bits > 0) { /* left-over bits */
4392 if (base64bits >= 6) {
4393 /* We've seen at least one base-64 character */
4394 errmsg = "partial character in shift sequence";
4395 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 else {
4398 /* Some bits remain; they should be zero */
4399 if (base64buffer != 0) {
4400 errmsg = "non-zero padding bits in shift sequence";
4401 goto utf7Error;
4402 }
4403 }
4404 }
4405 if (ch != '-') {
4406 /* '-' is absorbed; other terminating
4407 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004408 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4409 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 }
4412 }
4413 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 s++; /* consume '+' */
4416 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004418 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4419 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 }
4421 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004423 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 }
4426 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004428 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4429 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 s++;
4431 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004432 else {
4433 startinpos = s-starts;
4434 s++;
4435 errmsg = "unexpected special character";
4436 goto utf7Error;
4437 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 endinpos = s-starts;
4441 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 errors, &errorHandler,
4443 "utf7", errmsg,
4444 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004445 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447 }
4448
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 /* end of string */
4450
4451 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4452 /* if we're in an inconsistent state, that's an error */
4453 if (surrogate ||
4454 (base64bits >= 6) ||
4455 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 endinpos = size;
4457 if (unicode_decode_call_errorhandler(
4458 errors, &errorHandler,
4459 "utf7", "unterminated shift sequence",
4460 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004461 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 goto onError;
4463 if (s < e)
4464 goto restart;
4465 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467
4468 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004469 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
4474 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004477 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004479 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 goto onError;
4481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 Py_XDECREF(errorHandler);
4483 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004484 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 Py_XDECREF(errorHandler);
4488 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489 Py_DECREF(unicode);
4490 return NULL;
4491}
4492
4493
Alexander Belopolsky40018472011-02-26 01:02:56 +00004494PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004495_PyUnicode_EncodeUTF7(PyObject *str,
4496 int base64SetO,
4497 int base64WhiteSpace,
4498 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 int kind;
4501 void *data;
4502 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004503 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004505 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 unsigned int base64bits = 0;
4507 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 char * out;
4509 char * start;
4510
Benjamin Petersonbac79492012-01-14 13:34:47 -05004511 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004512 return NULL;
4513 kind = PyUnicode_KIND(str);
4514 data = PyUnicode_DATA(str);
4515 len = PyUnicode_GET_LENGTH(str);
4516
4517 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004520 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004521 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004522 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004523 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524 if (v == NULL)
4525 return NULL;
4526
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004527 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004528 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004529 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 if (inShift) {
4532 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4533 /* shifting out */
4534 if (base64bits) { /* output remaining bits */
4535 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4536 base64buffer = 0;
4537 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
4539 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 /* Characters not in the BASE64 set implicitly unshift the sequence
4541 so no '-' is required, except if the character is itself a '-' */
4542 if (IS_BASE64(ch) || ch == '-') {
4543 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 *out++ = (char) ch;
4546 }
4547 else {
4548 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 else { /* not in a shift sequence */
4552 if (ch == '+') {
4553 *out++ = '+';
4554 *out++ = '-';
4555 }
4556 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4557 *out++ = (char) ch;
4558 }
4559 else {
4560 *out++ = '+';
4561 inShift = 1;
4562 goto encode_char;
4563 }
4564 }
4565 continue;
4566encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004568 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004569
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 /* code first surrogate */
4571 base64bits += 16;
4572 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4573 while (base64bits >= 6) {
4574 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4575 base64bits -= 6;
4576 }
4577 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004578 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 base64bits += 16;
4581 base64buffer = (base64buffer << 16) | ch;
4582 while (base64bits >= 6) {
4583 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4584 base64bits -= 6;
4585 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 if (base64bits)
4588 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4589 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004591 if (_PyBytes_Resize(&v, out - start) < 0)
4592 return NULL;
4593 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004595PyObject *
4596PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4597 Py_ssize_t size,
4598 int base64SetO,
4599 int base64WhiteSpace,
4600 const char *errors)
4601{
4602 PyObject *result;
4603 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4604 if (tmp == NULL)
4605 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004606 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004607 base64WhiteSpace, errors);
4608 Py_DECREF(tmp);
4609 return result;
4610}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612#undef IS_BASE64
4613#undef FROM_BASE64
4614#undef TO_BASE64
4615#undef DECODE_DIRECT
4616#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618/* --- UTF-8 Codec -------------------------------------------------------- */
4619
Alexander Belopolsky40018472011-02-26 01:02:56 +00004620PyObject *
4621PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004622 Py_ssize_t size,
4623 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
Walter Dörwald69652032004-09-07 20:24:22 +00004625 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4626}
4627
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004628#include "stringlib/asciilib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004632#include "stringlib/ucs1lib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
4636#include "stringlib/ucs2lib.h"
4637#include "stringlib/codecs.h"
4638#include "stringlib/undef.h"
4639
4640#include "stringlib/ucs4lib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
Antoine Pitrouab868312009-01-10 15:40:25 +00004644/* Mask to quickly check whether a C 'long' contains a
4645 non-ASCII, UTF8-encoded char. */
4646#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004647# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004648#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004649# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004650#else
4651# error C 'long' size should be either 4 or 8!
4652#endif
4653
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654static Py_ssize_t
4655ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004656{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004657 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004658 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004659
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004661 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4662 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 /* Fast path, see in STRINGLIB(utf8_decode) for
4664 an explanation. */
4665 /* Help register allocation */
4666 register const char *_p = p;
4667 register Py_UCS1 * q = dest;
4668 while (_p < aligned_end) {
4669 unsigned long value = *(const unsigned long *) _p;
4670 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 *((unsigned long *)q) = value;
4673 _p += SIZEOF_LONG;
4674 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004675 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676 p = _p;
4677 while (p < end) {
4678 if ((unsigned char)*p & 0x80)
4679 break;
4680 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004682 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684#endif
4685 while (p < end) {
4686 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4687 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004688 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004689 /* Help register allocation */
4690 register const char *_p = p;
4691 while (_p < aligned_end) {
4692 unsigned long value = *(unsigned long *) _p;
4693 if (value & ASCII_CHAR_MASK)
4694 break;
4695 _p += SIZEOF_LONG;
4696 }
4697 p = _p;
4698 if (_p == end)
4699 break;
4700 }
4701 if ((unsigned char)*p & 0x80)
4702 break;
4703 ++p;
4704 }
4705 memcpy(dest, start, p - start);
4706 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707}
Antoine Pitrouab868312009-01-10 15:40:25 +00004708
Victor Stinner785938e2011-12-11 20:09:03 +01004709PyObject *
4710PyUnicode_DecodeUTF8Stateful(const char *s,
4711 Py_ssize_t size,
4712 const char *errors,
4713 Py_ssize_t *consumed)
4714{
Victor Stinner785938e2011-12-11 20:09:03 +01004715 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004716 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 const char *end = s + size;
4718 Py_ssize_t outpos;
4719
4720 Py_ssize_t startinpos;
4721 Py_ssize_t endinpos;
4722 const char *errmsg = "";
4723 PyObject *errorHandler = NULL;
4724 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004725
4726 if (size == 0) {
4727 if (consumed)
4728 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004729 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004730 }
4731
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4733 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004734 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 *consumed = 1;
4736 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004737 }
4738
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004740 if (!unicode)
4741 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004742
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4744 s += outpos;
4745 while (s < end) {
4746 Py_UCS4 ch;
4747 int kind = PyUnicode_KIND(unicode);
4748 if (kind == PyUnicode_1BYTE_KIND) {
4749 if (PyUnicode_IS_ASCII(unicode))
4750 ch = asciilib_utf8_decode(&s, end,
4751 PyUnicode_1BYTE_DATA(unicode), &outpos);
4752 else
4753 ch = ucs1lib_utf8_decode(&s, end,
4754 PyUnicode_1BYTE_DATA(unicode), &outpos);
4755 } else if (kind == PyUnicode_2BYTE_KIND) {
4756 ch = ucs2lib_utf8_decode(&s, end,
4757 PyUnicode_2BYTE_DATA(unicode), &outpos);
4758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
4760 ch = ucs4lib_utf8_decode(&s, end,
4761 PyUnicode_4BYTE_DATA(unicode), &outpos);
4762 }
4763
4764 switch (ch) {
4765 case 0:
4766 if (s == end || consumed)
4767 goto End;
4768 errmsg = "unexpected end of data";
4769 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004770 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 break;
4772 case 1:
4773 errmsg = "invalid start byte";
4774 startinpos = s - starts;
4775 endinpos = startinpos + 1;
4776 break;
4777 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004778 case 3:
4779 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004780 errmsg = "invalid continuation byte";
4781 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004782 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783 break;
4784 default:
4785 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4786 goto onError;
4787 continue;
4788 }
4789
4790 if (unicode_decode_call_errorhandler(
4791 errors, &errorHandler,
4792 "utf-8", errmsg,
4793 &starts, &end, &startinpos, &endinpos, &exc, &s,
4794 &unicode, &outpos))
4795 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004796 }
4797
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798End:
4799 if (unicode_resize(&unicode, outpos) < 0)
4800 goto onError;
4801
4802 if (consumed)
4803 *consumed = s - starts;
4804
4805 Py_XDECREF(errorHandler);
4806 Py_XDECREF(exc);
4807 assert(_PyUnicode_CheckConsistency(unicode, 1));
4808 return unicode;
4809
4810onError:
4811 Py_XDECREF(errorHandler);
4812 Py_XDECREF(exc);
4813 Py_XDECREF(unicode);
4814 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004815}
4816
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   s, size: the UTF-8 bytes to decode and their count.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error
   or when (size + 1) wide characters would not fit in PY_SSIZE_T_MAX
   bytes. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* The value is about to be split into a surrogate pair, so it
               must lie above the 16-bit range; it cannot itself be a
               surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* a zero result with all input used means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: map the byte at *s into U+DC00..U+DCFF */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873/* Primary internal function which creates utf8 encoded bytes objects.
4874
4875 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004876 and allocate exactly as much space needed at the end. Else allocate the
4877 maximum possible needed (4 result bytes per Unicode character), and return
4878 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004879*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004880PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004881_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Victor Stinner6099a032011-12-18 14:22:26 +01004883 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884 void *data;
4885 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887 if (!PyUnicode_Check(unicode)) {
4888 PyErr_BadArgument();
4889 return NULL;
4890 }
4891
4892 if (PyUnicode_READY(unicode) == -1)
4893 return NULL;
4894
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004895 if (PyUnicode_UTF8(unicode))
4896 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4897 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898
4899 kind = PyUnicode_KIND(unicode);
4900 data = PyUnicode_DATA(unicode);
4901 size = PyUnicode_GET_LENGTH(unicode);
4902
Benjamin Petersonead6b532011-12-20 17:23:42 -06004903 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004904 default:
4905 assert(0);
4906 case PyUnicode_1BYTE_KIND:
4907 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4908 assert(!PyUnicode_IS_ASCII(unicode));
4909 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4910 case PyUnicode_2BYTE_KIND:
4911 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4912 case PyUnicode_4BYTE_KIND:
4913 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915}
4916
Alexander Belopolsky40018472011-02-26 01:02:56 +00004917PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004918PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4919 Py_ssize_t size,
4920 const char *errors)
4921{
4922 PyObject *v, *unicode;
4923
4924 unicode = PyUnicode_FromUnicode(s, size);
4925 if (unicode == NULL)
4926 return NULL;
4927 v = _PyUnicode_AsUTF8String(unicode, errors);
4928 Py_DECREF(unicode);
4929 return v;
4930}
4931
4932PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004933PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004935 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936}
4937
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938/* --- UTF-32 Codec ------------------------------------------------------- */
4939
4940PyObject *
4941PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 Py_ssize_t size,
4943 const char *errors,
4944 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945{
4946 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4947}
4948
4949PyObject *
4950PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 Py_ssize_t size,
4952 const char *errors,
4953 int *byteorder,
4954 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955{
4956 const char *starts = s;
4957 Py_ssize_t startinpos;
4958 Py_ssize_t endinpos;
4959 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004960 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004961 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 int bo = 0; /* assume native ordering by default */
4963 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964 /* Offsets from q for retrieving bytes in the right order. */
4965#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4966 int iorder[] = {0, 1, 2, 3};
4967#else
4968 int iorder[] = {3, 2, 1, 0};
4969#endif
4970 PyObject *errorHandler = NULL;
4971 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004972
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973 q = (unsigned char *)s;
4974 e = q + size;
4975
4976 if (byteorder)
4977 bo = *byteorder;
4978
4979 /* Check for BOM marks (U+FEFF) in the input and adjust current
4980 byte order setting accordingly. In native mode, the leading BOM
4981 mark is skipped, in all other modes, it is copied to the output
4982 stream as-is (giving a ZWNBSP character). */
4983 if (bo == 0) {
4984 if (size >= 4) {
4985 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004987#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 if (bom == 0x0000FEFF) {
4989 q += 4;
4990 bo = -1;
4991 }
4992 else if (bom == 0xFFFE0000) {
4993 q += 4;
4994 bo = 1;
4995 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004996#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 if (bom == 0x0000FEFF) {
4998 q += 4;
4999 bo = 1;
5000 }
5001 else if (bom == 0xFFFE0000) {
5002 q += 4;
5003 bo = -1;
5004 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007 }
5008
5009 if (bo == -1) {
5010 /* force LE */
5011 iorder[0] = 0;
5012 iorder[1] = 1;
5013 iorder[2] = 2;
5014 iorder[3] = 3;
5015 }
5016 else if (bo == 1) {
5017 /* force BE */
5018 iorder[0] = 3;
5019 iorder[1] = 2;
5020 iorder[2] = 1;
5021 iorder[3] = 0;
5022 }
5023
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005024 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005025 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005026 if (!unicode)
5027 return NULL;
5028 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005029 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005030 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005031
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 Py_UCS4 ch;
5034 /* remaining bytes at the end? (size should be divisible by 4) */
5035 if (e-q<4) {
5036 if (consumed)
5037 break;
5038 errmsg = "truncated data";
5039 startinpos = ((const char *)q)-starts;
5040 endinpos = ((const char *)e)-starts;
5041 goto utf32Error;
5042 /* The remaining input chars are ignored if the callback
5043 chooses to skip the input */
5044 }
5045 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5046 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 if (ch >= 0x110000)
5049 {
5050 errmsg = "codepoint not in range(0x110000)";
5051 startinpos = ((const char *)q)-starts;
5052 endinpos = startinpos+4;
5053 goto utf32Error;
5054 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005055 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5056 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 q += 4;
5058 continue;
5059 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 if (unicode_decode_call_errorhandler(
5061 errors, &errorHandler,
5062 "utf32", errmsg,
5063 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005064 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066 }
5067
5068 if (byteorder)
5069 *byteorder = bo;
5070
5071 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073
5074 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005075 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076 goto onError;
5077
5078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005080 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 Py_DECREF(unicode);
5084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005094 int kind;
5095 void *data;
5096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005099 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100 /* Offsets from p for storing byte pairs in the right order. */
5101#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5102 int iorder[] = {0, 1, 2, 3};
5103#else
5104 int iorder[] = {3, 2, 1, 0};
5105#endif
5106
Benjamin Peterson29060642009-01-31 22:14:21 +00005107#define STORECHAR(CH) \
5108 do { \
5109 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5110 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5111 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5112 p[iorder[0]] = (CH) & 0xff; \
5113 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 } while(0)
5115
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005116 if (!PyUnicode_Check(str)) {
5117 PyErr_BadArgument();
5118 return NULL;
5119 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005120 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005121 return NULL;
5122 kind = PyUnicode_KIND(str);
5123 data = PyUnicode_DATA(str);
5124 len = PyUnicode_GET_LENGTH(str);
5125
5126 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005127 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005129 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130 if (v == NULL)
5131 return NULL;
5132
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005133 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005136 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005137 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138
5139 if (byteorder == -1) {
5140 /* force LE */
5141 iorder[0] = 0;
5142 iorder[1] = 1;
5143 iorder[2] = 2;
5144 iorder[3] = 3;
5145 }
5146 else if (byteorder == 1) {
5147 /* force BE */
5148 iorder[0] = 3;
5149 iorder[1] = 2;
5150 iorder[2] = 1;
5151 iorder[3] = 0;
5152 }
5153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005154 for (i = 0; i < len; i++)
5155 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156
5157 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005158 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159#undef STORECHAR
5160}
5161
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005163PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5164 Py_ssize_t size,
5165 const char *errors,
5166 int byteorder)
5167{
5168 PyObject *result;
5169 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5170 if (tmp == NULL)
5171 return NULL;
5172 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5173 Py_DECREF(tmp);
5174 return result;
5175}
5176
5177PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005178PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179{
Victor Stinnerb960b342011-11-20 19:12:52 +01005180 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005181}
5182
/* --- UTF-16 Codec ------------------------------------------------------- */
5184
Tim Peters772747b2001-08-09 22:21:55 +00005185PyObject *
5186PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 Py_ssize_t size,
5188 const char *errors,
5189 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190{
Walter Dörwald69652032004-09-07 20:24:22 +00005191 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5192}
5193
5194PyObject *
5195PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_ssize_t size,
5197 const char *errors,
5198 int *byteorder,
5199 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t startinpos;
5203 Py_ssize_t endinpos;
5204 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005205 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005207 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005208 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005209 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 PyObject *errorHandler = NULL;
5211 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Tim Peters772747b2001-08-09 22:21:55 +00005213 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005214 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005217 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005219 /* Check for BOM marks (U+FEFF) in the input and adjust current
5220 byte order setting accordingly. In native mode, the leading BOM
5221 mark is skipped, in all other modes, it is copied to the output
5222 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005223 if (bo == 0 && size >= 2) {
5224 const Py_UCS4 bom = (q[1] << 8) | q[0];
5225 if (bom == 0xFEFF) {
5226 q += 2;
5227 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 else if (bom == 0xFFFE) {
5230 q += 2;
5231 bo = 1;
5232 }
5233 if (byteorder)
5234 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237 if (q == e) {
5238 if (consumed)
5239 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005240 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005241 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005242
Antoine Pitrouab868312009-01-10 15:40:25 +00005243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005245#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005247#endif
Tim Peters772747b2001-08-09 22:21:55 +00005248
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 /* Note: size will always be longer than the resulting Unicode
5250 character count */
5251 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5252 if (!unicode)
5253 return NULL;
5254
5255 outpos = 0;
5256 while (1) {
5257 Py_UCS4 ch = 0;
5258 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005259 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 if (kind == PyUnicode_1BYTE_KIND) {
5261 if (PyUnicode_IS_ASCII(unicode))
5262 ch = asciilib_utf16_decode(&q, e,
5263 PyUnicode_1BYTE_DATA(unicode), &outpos,
5264 native_ordering);
5265 else
5266 ch = ucs1lib_utf16_decode(&q, e,
5267 PyUnicode_1BYTE_DATA(unicode), &outpos,
5268 native_ordering);
5269 } else if (kind == PyUnicode_2BYTE_KIND) {
5270 ch = ucs2lib_utf16_decode(&q, e,
5271 PyUnicode_2BYTE_DATA(unicode), &outpos,
5272 native_ordering);
5273 } else {
5274 assert(kind == PyUnicode_4BYTE_KIND);
5275 ch = ucs4lib_utf16_decode(&q, e,
5276 PyUnicode_4BYTE_DATA(unicode), &outpos,
5277 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005278 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005279 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280
Antoine Pitrou63065d72012-05-15 23:48:04 +02005281 switch (ch)
5282 {
5283 case 0:
5284 /* remaining byte at the end? (size should be even) */
5285 if (q == e || consumed)
5286 goto End;
5287 errmsg = "truncated data";
5288 startinpos = ((const char *)q) - starts;
5289 endinpos = ((const char *)e) - starts;
5290 break;
5291 /* The remaining input chars are ignored if the callback
5292 chooses to skip the input */
5293 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005294 q -= 2;
5295 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005296 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005298 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 endinpos = ((const char *)e) - starts;
5300 break;
5301 case 2:
5302 errmsg = "illegal encoding";
5303 startinpos = ((const char *)q) - 2 - starts;
5304 endinpos = startinpos + 2;
5305 break;
5306 case 3:
5307 errmsg = "illegal UTF-16 surrogate";
5308 startinpos = ((const char *)q) - 4 - starts;
5309 endinpos = startinpos + 2;
5310 break;
5311 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005312 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5313 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 continue;
5315 }
5316
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 errors,
5319 &errorHandler,
5320 "utf16", errmsg,
5321 &starts,
5322 (const char **)&e,
5323 &startinpos,
5324 &endinpos,
5325 &exc,
5326 (const char **)&q,
5327 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005328 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 }
5331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332End:
Walter Dörwald69652032004-09-07 20:24:22 +00005333 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005337 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 goto onError;
5339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 Py_XDECREF(errorHandler);
5341 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005342 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 Py_XDECREF(errorHandler);
5347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 return NULL;
5349}
5350
Tim Peters772747b2001-08-09 22:21:55 +00005351PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005352_PyUnicode_EncodeUTF16(PyObject *str,
5353 const char *errors,
5354 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005356 enum PyUnicode_Kind kind;
5357 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005358 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005359 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005360 unsigned short *out;
5361 Py_ssize_t bytesize;
5362 Py_ssize_t pairs;
5363#ifdef WORDS_BIGENDIAN
5364 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005365#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005367#endif
5368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005369 if (!PyUnicode_Check(str)) {
5370 PyErr_BadArgument();
5371 return NULL;
5372 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005373 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005374 return NULL;
5375 kind = PyUnicode_KIND(str);
5376 data = PyUnicode_DATA(str);
5377 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005378
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005380 if (kind == PyUnicode_4BYTE_KIND) {
5381 const Py_UCS4 *in = (const Py_UCS4 *)data;
5382 const Py_UCS4 *end = in + len;
5383 while (in < end)
5384 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005386 }
5387 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005389 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 if (v == NULL)
5392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005394 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005395 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005396 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005399 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005400 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005401
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005402 switch (kind) {
5403 case PyUnicode_1BYTE_KIND: {
5404 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5405 break;
Tim Peters772747b2001-08-09 22:21:55 +00005406 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005407 case PyUnicode_2BYTE_KIND: {
5408 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5409 break;
Tim Peters772747b2001-08-09 22:21:55 +00005410 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 case PyUnicode_4BYTE_KIND: {
5412 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5413 break;
5414 }
5415 default:
5416 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005417 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005418
5419 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005420 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421}
5422
Alexander Belopolsky40018472011-02-26 01:02:56 +00005423PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005424PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5425 Py_ssize_t size,
5426 const char *errors,
5427 int byteorder)
5428{
5429 PyObject *result;
5430 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5431 if (tmp == NULL)
5432 return NULL;
5433 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5434 Py_DECREF(tmp);
5435 return result;
5436}
5437
5438PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005439PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005441 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
/* --- Unicode Escape Codec ----------------------------------------------- */
5445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005446/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5447 if all the escapes in the string make it still a valid ASCII string.
5448 Returns -1 if any escapes were found which cause the string to
5449 pop out of ASCII range. Otherwise returns the length of the
5450 required buffer to hold the string.
5451 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005452static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005453length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5454{
5455 const unsigned char *p = (const unsigned char *)s;
5456 const unsigned char *end = p + size;
5457 Py_ssize_t length = 0;
5458
5459 if (size < 0)
5460 return -1;
5461
5462 for (; p < end; ++p) {
5463 if (*p > 127) {
5464 /* Non-ASCII */
5465 return -1;
5466 }
5467 else if (*p != '\\') {
5468 /* Normal character */
5469 ++length;
5470 }
5471 else {
5472 /* Backslash-escape, check next char */
5473 ++p;
5474 /* Escape sequence reaches till end of string or
5475 non-ASCII follow-up. */
5476 if (p >= end || *p > 127)
5477 return -1;
5478 switch (*p) {
5479 case '\n':
5480 /* backslash + \n result in zero characters */
5481 break;
5482 case '\\': case '\'': case '\"':
5483 case 'b': case 'f': case 't':
5484 case 'n': case 'r': case 'v': case 'a':
5485 ++length;
5486 break;
5487 case '0': case '1': case '2': case '3':
5488 case '4': case '5': case '6': case '7':
5489 case 'x': case 'u': case 'U': case 'N':
5490 /* these do not guarantee ASCII characters */
5491 return -1;
5492 default:
5493 /* count the backslash + the other character */
5494 length += 2;
5495 }
5496 }
5497 }
5498 return length;
5499}
5500
Fredrik Lundh06d12682001-01-24 07:59:11 +00005501static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005502
Alexander Belopolsky40018472011-02-26 01:02:56 +00005503PyObject *
5504PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005505 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005506 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 Py_ssize_t startinpos;
5510 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005511 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005513 char* message;
5514 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 PyObject *errorHandler = NULL;
5516 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005517 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005519
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005520 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521
5522 /* After length_of_escaped_ascii_string() there are two alternatives,
5523 either the string is pure ASCII with named escapes like \n, etc.
5524 and we determined it's exact size (common case)
5525 or it contains \x, \u, ... escape sequences. then we create a
5526 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005527 if (len >= 0) {
5528 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 if (!v)
5530 goto onError;
5531 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 }
5533 else {
5534 /* Escaped strings will always be longer than the resulting
5535 Unicode string, so we start with size here and then reduce the
5536 length after conversion to the true value.
5537 (but if the error callback returns a long replacement string
5538 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005539 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 if (!v)
5541 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005542 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 }
5544
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005546 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 while (s < end) {
5551 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005552 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005555 /* The only case in which i == ascii_length is a backslash
5556 followed by a newline. */
5557 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 /* Non-escape characters are interpreted as Unicode ordinals */
5560 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005561 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 continue;
5564 }
5565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 /* \ - Escapes */
5568 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005569 c = *s++;
5570 if (s > end)
5571 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005573 /* The only case in which i == ascii_length is a backslash
5574 followed by a newline. */
5575 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005577 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580#define WRITECHAR(ch) \
5581 do { \
5582 if (unicode_putchar(&v, &i, ch) < 0) \
5583 goto onError; \
5584 }while(0)
5585
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 case '\\': WRITECHAR('\\'); break;
5588 case '\'': WRITECHAR('\''); break;
5589 case '\"': WRITECHAR('\"'); break;
5590 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005592 case 'f': WRITECHAR('\014'); break;
5593 case 't': WRITECHAR('\t'); break;
5594 case 'n': WRITECHAR('\n'); break;
5595 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005597 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 case '0': case '1': case '2': case '3':
5603 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005604 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005605 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005606 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005607 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005608 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005610 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 break;
5612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 /* hex escapes */
5614 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 digits = 2;
5617 message = "truncated \\xXX escape";
5618 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005622 digits = 4;
5623 message = "truncated \\uXXXX escape";
5624 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005627 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005628 digits = 8;
5629 message = "truncated \\UXXXXXXXX escape";
5630 hexescape:
5631 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005632 if (end - s < digits) {
5633 /* count only hex digits */
5634 for (; s < end; ++s) {
5635 c = (unsigned char)*s;
5636 if (!Py_ISXDIGIT(c))
5637 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005638 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005639 goto error;
5640 }
5641 for (; digits--; ++s) {
5642 c = (unsigned char)*s;
5643 if (!Py_ISXDIGIT(c))
5644 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005645 chr = (chr<<4) & ~0xF;
5646 if (c >= '0' && c <= '9')
5647 chr += c - '0';
5648 else if (c >= 'a' && c <= 'f')
5649 chr += 10 + c - 'a';
5650 else
5651 chr += 10 + c - 'A';
5652 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005653 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 /* _decoding_error will have already written into the
5655 target buffer. */
5656 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005657 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005658 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005659 message = "illegal Unicode character";
5660 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005661 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005662 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005663 break;
5664
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005666 case 'N':
5667 message = "malformed \\N character escape";
5668 if (ucnhash_CAPI == NULL) {
5669 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5671 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005672 if (ucnhash_CAPI == NULL)
5673 goto ucnhashError;
5674 }
5675 if (*s == '{') {
5676 const char *start = s+1;
5677 /* look for the closing brace */
5678 while (*s != '}' && s < end)
5679 s++;
5680 if (s > start && s < end && *s == '}') {
5681 /* found a name. look it up in the unicode database */
5682 message = "unknown Unicode character name";
5683 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005684 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005685 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005686 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687 goto store;
5688 }
5689 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005690 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691
5692 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005693 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 message = "\\ at end of string";
5695 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005696 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005697 }
5698 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005700 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005701 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005704 continue;
5705
5706 error:
5707 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005708 if (unicode_decode_call_errorhandler(
5709 errors, &errorHandler,
5710 "unicodeescape", message,
5711 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005712 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005713 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005714 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005715 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718
Victor Stinner16e6a802011-12-12 13:24:15 +01005719 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005720 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005721 Py_XDECREF(errorHandler);
5722 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005723 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005726 PyErr_SetString(
5727 PyExc_UnicodeError,
5728 "\\N escapes not supported (can't load unicodedata module)"
5729 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005730 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 Py_XDECREF(errorHandler);
5732 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005733 return NULL;
5734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 Py_XDECREF(errorHandler);
5738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 return NULL;
5740}
5741
5742/* Return a Unicode-Escape string version of the Unicode object.
5743
5744 If quotes is true, the string is enclosed in u"" or u'' quotes as
5745 appropriate.
5746
5747*/
5748
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005753 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 int kind;
5756 void *data;
5757 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Ezio Melottie7f90372012-10-05 03:33:31 +03005759 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005760 escape.
5761
Ezio Melottie7f90372012-10-05 03:33:31 +03005762 For UCS1 strings it's '\xxx', 4 bytes per source character.
5763 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5764 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005765 */
5766
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005767 if (!PyUnicode_Check(unicode)) {
5768 PyErr_BadArgument();
5769 return NULL;
5770 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005771 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772 return NULL;
5773 len = PyUnicode_GET_LENGTH(unicode);
5774 kind = PyUnicode_KIND(unicode);
5775 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005776 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005777 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5778 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5779 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5780 }
5781
5782 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005783 return PyBytes_FromStringAndSize(NULL, 0);
5784
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005787
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005788 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 if (repr == NULL)
5793 return NULL;
5794
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005795 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005797 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005798 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005799
Walter Dörwald79e913e2007-05-12 11:08:06 +00005800 /* Escape backslashes */
5801 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 *p++ = '\\';
5803 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005804 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005805 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005806
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005807 /* Map 21-bit characters to '\U00xxxxxx' */
5808 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005809 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005810 *p++ = '\\';
5811 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005812 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5813 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5814 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5815 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5816 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5817 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5818 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5819 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005821 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005822
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005824 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 *p++ = '\\';
5826 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005827 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5828 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5829 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5830 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005832
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005833 /* Map special whitespace to '\t', \n', '\r' */
5834 else if (ch == '\t') {
5835 *p++ = '\\';
5836 *p++ = 't';
5837 }
5838 else if (ch == '\n') {
5839 *p++ = '\\';
5840 *p++ = 'n';
5841 }
5842 else if (ch == '\r') {
5843 *p++ = '\\';
5844 *p++ = 'r';
5845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005846
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005847 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005848 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005850 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005851 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5852 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005853 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005854
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 /* Copy everything else as-is */
5856 else
5857 *p++ = (char) ch;
5858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005860 assert(p - PyBytes_AS_STRING(repr) > 0);
5861 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5862 return NULL;
5863 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864}
5865
Alexander Belopolsky40018472011-02-26 01:02:56 +00005866PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5868 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870 PyObject *result;
5871 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5872 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 result = PyUnicode_AsUnicodeEscapeString(tmp);
5875 Py_DECREF(tmp);
5876 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877}
5878
5879/* --- Raw Unicode Escape Codec ------------------------------------------- */
5880
Alexander Belopolsky40018472011-02-26 01:02:56 +00005881PyObject *
5882PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005883 Py_ssize_t size,
5884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005887 Py_ssize_t startinpos;
5888 Py_ssize_t endinpos;
5889 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005890 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 const char *end;
5892 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 PyObject *errorHandler = NULL;
5894 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 /* Escaped strings will always be longer than the resulting
5897 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 length after conversion to the true value. (But decoding error
5899 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005900 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005904 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005905 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 end = s + size;
5907 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 unsigned char c;
5909 Py_UCS4 x;
5910 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005911 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 /* Non-escape characters are interpreted as Unicode ordinals */
5914 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005915 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5916 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005918 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 startinpos = s-starts;
5920
5921 /* \u-escapes are only interpreted iff the number of leading
5922 backslashes if odd */
5923 bs = s;
5924 for (;s < end;) {
5925 if (*s != '\\')
5926 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5928 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 }
5930 if (((s - bs) & 1) == 0 ||
5931 s >= end ||
5932 (*s != 'u' && *s != 'U')) {
5933 continue;
5934 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005935 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 count = *s=='u' ? 4 : 8;
5937 s++;
5938
5939 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 for (x = 0, i = 0; i < count; ++i, ++s) {
5941 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005942 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 endinpos = s-starts;
5944 if (unicode_decode_call_errorhandler(
5945 errors, &errorHandler,
5946 "rawunicodeescape", "truncated \\uXXXX",
5947 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005948 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 goto onError;
5950 goto nextByte;
5951 }
5952 x = (x<<4) & ~0xF;
5953 if (c >= '0' && c <= '9')
5954 x += c - '0';
5955 else if (c >= 'a' && c <= 'f')
5956 x += 10 + c - 'a';
5957 else
5958 x += 10 + c - 'A';
5959 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005960 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005961 if (unicode_putchar(&v, &outpos, x) < 0)
5962 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005963 } else {
5964 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005965 if (unicode_decode_call_errorhandler(
5966 errors, &errorHandler,
5967 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005969 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005971 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 nextByte:
5973 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005975 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 Py_XDECREF(errorHandler);
5978 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005979 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 Py_XDECREF(errorHandler);
5984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 return NULL;
5986}
5987
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988
Alexander Belopolsky40018472011-02-26 01:02:56 +00005989PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005990PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005992 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 char *p;
5994 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 Py_ssize_t expandsize, pos;
5996 int kind;
5997 void *data;
5998 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006000 if (!PyUnicode_Check(unicode)) {
6001 PyErr_BadArgument();
6002 return NULL;
6003 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006004 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006005 return NULL;
6006 kind = PyUnicode_KIND(unicode);
6007 data = PyUnicode_DATA(unicode);
6008 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006009 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6010 bytes, and 1 byte characters 4. */
6011 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006012
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006015
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 if (repr == NULL)
6018 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006020 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006022 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 for (pos = 0; pos < len; pos++) {
6024 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 /* Map 32-bit characters to '\Uxxxxxxxx' */
6026 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006027 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006028 *p++ = '\\';
6029 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006030 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6031 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6032 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6033 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6034 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6035 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6036 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6037 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006038 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 *p++ = '\\';
6042 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006043 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6044 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6045 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6046 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* Copy everything else as-is */
6049 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 *p++ = (char) ch;
6051 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006052
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006053 assert(p > q);
6054 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006055 return NULL;
6056 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006060PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6061 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 PyObject *result;
6064 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6065 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006066 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6068 Py_DECREF(tmp);
6069 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070}
6071
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006072/* --- Unicode Internal Codec ------------------------------------------- */
6073
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074PyObject *
6075_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006076 Py_ssize_t size,
6077 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006078{
6079 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t startinpos;
6081 Py_ssize_t endinpos;
6082 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006083 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006084 const char *end;
6085 const char *reason;
6086 PyObject *errorHandler = NULL;
6087 PyObject *exc = NULL;
6088
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006089 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006090 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006091 1))
6092 return NULL;
6093
Thomas Wouters89f507f2006-12-13 04:49:30 +00006094 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006096 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006099 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006100 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006101 end = s + size;
6102
6103 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006104 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006105 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006106 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006107 endinpos = end-starts;
6108 reason = "truncated input";
6109 goto error;
6110 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006111 /* We copy the raw representation one byte at a time because the
6112 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006113 ((char *) &uch)[0] = s[0];
6114 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006115#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006116 ((char *) &uch)[2] = s[2];
6117 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006118#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006119 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006120#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006121 /* We have to sanity check the raw data, otherwise doom looms for
6122 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006123 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006124 endinpos = s - starts + Py_UNICODE_SIZE;
6125 reason = "illegal code point (> 0x10FFFF)";
6126 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006127 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006128#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006129 s += Py_UNICODE_SIZE;
6130#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006131 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006132 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006133 Py_UNICODE uch2;
6134 ((char *) &uch2)[0] = s[0];
6135 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006136 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006137 {
Victor Stinner551ac952011-11-29 22:58:13 +01006138 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006139 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006140 }
6141 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006142#endif
6143
6144 if (unicode_putchar(&v, &outpos, ch) < 0)
6145 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006146 continue;
6147
6148 error:
6149 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006150 if (unicode_decode_call_errorhandler(
6151 errors, &errorHandler,
6152 "unicode_internal", reason,
6153 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006154 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006155 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006156 }
6157
Victor Stinner16e6a802011-12-12 13:24:15 +01006158 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006159 goto onError;
6160 Py_XDECREF(errorHandler);
6161 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006162 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006163
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006165 Py_XDECREF(v);
6166 Py_XDECREF(errorHandler);
6167 Py_XDECREF(exc);
6168 return NULL;
6169}
6170
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171/* --- Latin-1 Codec ------------------------------------------------------ */
6172
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173PyObject *
6174PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006175 Py_ssize_t size,
6176 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006179 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006182/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006183static void
6184make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006185 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006186 PyObject *unicode,
6187 Py_ssize_t startpos, Py_ssize_t endpos,
6188 const char *reason)
6189{
6190 if (*exceptionObject == NULL) {
6191 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006192 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006193 encoding, unicode, startpos, endpos, reason);
6194 }
6195 else {
6196 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6197 goto onError;
6198 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6199 goto onError;
6200 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6201 goto onError;
6202 return;
6203 onError:
6204 Py_DECREF(*exceptionObject);
6205 *exceptionObject = NULL;
6206 }
6207}
6208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006210static void
6211raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006212 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006213 PyObject *unicode,
6214 Py_ssize_t startpos, Py_ssize_t endpos,
6215 const char *reason)
6216{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006217 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006218 encoding, unicode, startpos, endpos, reason);
6219 if (*exceptionObject != NULL)
6220 PyCodec_StrictErrors(*exceptionObject);
6221}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222
6223/* error handling callback helper:
6224 build arguments, call the callback and check the arguments,
6225 put the result into newpos and return the replacement string, which
6226 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006227static PyObject *
6228unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006229 PyObject **errorHandler,
6230 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006231 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006232 Py_ssize_t startpos, Py_ssize_t endpos,
6233 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006234{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006235 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006236 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006237 PyObject *restuple;
6238 PyObject *resunicode;
6239
6240 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006242 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006243 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006244 }
6245
Benjamin Petersonbac79492012-01-14 13:34:47 -05006246 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006247 return NULL;
6248 len = PyUnicode_GET_LENGTH(unicode);
6249
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006250 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006252 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254
6255 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006256 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006257 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006259 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006260 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006261 Py_DECREF(restuple);
6262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006264 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 &resunicode, newpos)) {
6266 Py_DECREF(restuple);
6267 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006268 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006269 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6270 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6271 Py_DECREF(restuple);
6272 return NULL;
6273 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006274 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006275 *newpos = len + *newpos;
6276 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6278 Py_DECREF(restuple);
6279 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006280 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 Py_INCREF(resunicode);
6282 Py_DECREF(restuple);
6283 return resunicode;
6284}
6285
Alexander Belopolsky40018472011-02-26 01:02:56 +00006286static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006287unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006288 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006289 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006290{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291 /* input state */
6292 Py_ssize_t pos=0, size;
6293 int kind;
6294 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295 /* output object */
6296 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006297 /* pointer into the output */
6298 char *str;
6299 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006300 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006301 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6302 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006303 PyObject *errorHandler = NULL;
6304 PyObject *exc = NULL;
6305 /* the following variable is used for caching string comparisons
6306 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6307 int known_errorHandler = -1;
6308
Benjamin Petersonbac79492012-01-14 13:34:47 -05006309 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006310 return NULL;
6311 size = PyUnicode_GET_LENGTH(unicode);
6312 kind = PyUnicode_KIND(unicode);
6313 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006314 /* allocate enough for a simple encoding without
6315 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006316 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006317 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006318 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006319 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006320 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006321 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322 ressize = size;
6323
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006324 while (pos < size) {
6325 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326
Benjamin Peterson29060642009-01-31 22:14:21 +00006327 /* can we encode this? */
6328 if (c<limit) {
6329 /* no overflow check, because we know that the space is enough */
6330 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006331 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006332 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006334 Py_ssize_t requiredsize;
6335 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006336 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006338 Py_ssize_t collstart = pos;
6339 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006340 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006341 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006342 ++collend;
6343 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6344 if (known_errorHandler==-1) {
6345 if ((errors==NULL) || (!strcmp(errors, "strict")))
6346 known_errorHandler = 1;
6347 else if (!strcmp(errors, "replace"))
6348 known_errorHandler = 2;
6349 else if (!strcmp(errors, "ignore"))
6350 known_errorHandler = 3;
6351 else if (!strcmp(errors, "xmlcharrefreplace"))
6352 known_errorHandler = 4;
6353 else
6354 known_errorHandler = 0;
6355 }
6356 switch (known_errorHandler) {
6357 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006358 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 goto onError;
6360 case 2: /* replace */
6361 while (collstart++<collend)
6362 *str++ = '?'; /* fall through */
6363 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 break;
6366 case 4: /* xmlcharrefreplace */
6367 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 /* determine replacement size */
6369 for (i = collstart, repsize = 0; i < collend; ++i) {
6370 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6371 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006373 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006375 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006381 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006383 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006384 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006385 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006387 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 if (requiredsize > ressize) {
6390 if (requiredsize<2*ressize)
6391 requiredsize = 2*ressize;
6392 if (_PyBytes_Resize(&res, requiredsize))
6393 goto onError;
6394 str = PyBytes_AS_STRING(res) + respos;
6395 ressize = requiredsize;
6396 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006397 /* generate replacement */
6398 for (i = collstart; i < collend; ++i) {
6399 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006400 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 break;
6403 default:
6404 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 encoding, reason, unicode, &exc,
6406 collstart, collend, &newpos);
6407 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006408 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006409 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006410 if (PyBytes_Check(repunicode)) {
6411 /* Directly copy bytes result to output. */
6412 repsize = PyBytes_Size(repunicode);
6413 if (repsize > 1) {
6414 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006415 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006416 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6417 Py_DECREF(repunicode);
6418 goto onError;
6419 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006420 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006421 ressize += repsize-1;
6422 }
6423 memcpy(str, PyBytes_AsString(repunicode), repsize);
6424 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006425 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006426 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006427 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006428 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006429 /* need more space? (at least enough for what we
6430 have+the replacement+the rest of the string, so
6431 we won't have to check space for encodable characters) */
6432 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006433 repsize = PyUnicode_GET_LENGTH(repunicode);
6434 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 if (requiredsize > ressize) {
6436 if (requiredsize<2*ressize)
6437 requiredsize = 2*ressize;
6438 if (_PyBytes_Resize(&res, requiredsize)) {
6439 Py_DECREF(repunicode);
6440 goto onError;
6441 }
6442 str = PyBytes_AS_STRING(res) + respos;
6443 ressize = requiredsize;
6444 }
6445 /* check if there is anything unencodable in the replacement
6446 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006447 for (i = 0; repsize-->0; ++i, ++str) {
6448 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006449 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006450 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 Py_DECREF(repunicode);
6453 goto onError;
6454 }
6455 *str = (char)c;
6456 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006458 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006460 }
6461 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006462 /* Resize if we allocated to much */
6463 size = str - PyBytes_AS_STRING(res);
6464 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006465 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006466 if (_PyBytes_Resize(&res, size) < 0)
6467 goto onError;
6468 }
6469
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470 Py_XDECREF(errorHandler);
6471 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006472 return res;
6473
6474 onError:
6475 Py_XDECREF(res);
6476 Py_XDECREF(errorHandler);
6477 Py_XDECREF(exc);
6478 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479}
6480
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006482PyObject *
6483PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006484 Py_ssize_t size,
6485 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006486{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 PyObject *result;
6488 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6489 if (unicode == NULL)
6490 return NULL;
6491 result = unicode_encode_ucs1(unicode, errors, 256);
6492 Py_DECREF(unicode);
6493 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494}
6495
Alexander Belopolsky40018472011-02-26 01:02:56 +00006496PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006497_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498{
6499 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 PyErr_BadArgument();
6501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006503 if (PyUnicode_READY(unicode) == -1)
6504 return NULL;
6505 /* Fast path: if it is a one-byte string, construct
6506 bytes object directly. */
6507 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6508 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6509 PyUnicode_GET_LENGTH(unicode));
6510 /* Non-Latin-1 characters present. Defer to above function to
6511 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006513}
6514
6515PyObject*
6516PyUnicode_AsLatin1String(PyObject *unicode)
6517{
6518 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519}
6520
6521/* --- 7-bit ASCII Codec -------------------------------------------------- */
6522
Alexander Belopolsky40018472011-02-26 01:02:56 +00006523PyObject *
6524PyUnicode_DecodeASCII(const char *s,
6525 Py_ssize_t size,
6526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006529 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006530 int kind;
6531 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006532 Py_ssize_t startinpos;
6533 Py_ssize_t endinpos;
6534 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006535 const char *e;
6536 PyObject *errorHandler = NULL;
6537 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006538
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006540 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006541
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006543 if (size == 1 && (unsigned char)s[0] < 128)
6544 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006545
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006546 unicode = PyUnicode_New(size, 127);
6547 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006549
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006551 data = PyUnicode_1BYTE_DATA(unicode);
6552 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6553 if (outpos == size)
6554 return unicode;
6555
6556 s += outpos;
6557 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 register unsigned char c = (unsigned char)*s;
6560 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006561 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 ++s;
6563 }
6564 else {
6565 startinpos = s-starts;
6566 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 if (unicode_decode_call_errorhandler(
6568 errors, &errorHandler,
6569 "ascii", "ordinal not in range(128)",
6570 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006571 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006573 kind = PyUnicode_KIND(unicode);
6574 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006577 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006578 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006579 Py_XDECREF(errorHandler);
6580 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006581 assert(_PyUnicode_CheckConsistency(unicode, 1));
6582 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006583
Benjamin Peterson29060642009-01-31 22:14:21 +00006584 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006585 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006586 Py_XDECREF(errorHandler);
6587 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588 return NULL;
6589}
6590
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006591/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006592PyObject *
6593PyUnicode_EncodeASCII(const Py_UNICODE *p,
6594 Py_ssize_t size,
6595 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006596{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006597 PyObject *result;
6598 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6599 if (unicode == NULL)
6600 return NULL;
6601 result = unicode_encode_ucs1(unicode, errors, 128);
6602 Py_DECREF(unicode);
6603 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604}
6605
Alexander Belopolsky40018472011-02-26 01:02:56 +00006606PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006607_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608{
6609 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 PyErr_BadArgument();
6611 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006613 if (PyUnicode_READY(unicode) == -1)
6614 return NULL;
6615 /* Fast path: if it is an ASCII-only string, construct bytes object
6616 directly. Else defer to above function to raise the exception. */
6617 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6618 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6619 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006621}
6622
6623PyObject *
6624PyUnicode_AsASCIIString(PyObject *unicode)
6625{
6626 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627}
6628
Victor Stinner99b95382011-07-04 14:23:54 +02006629#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006630
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006631/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006632
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006633#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006634#define NEED_RETRY
6635#endif
6636
Victor Stinner3a50e702011-10-18 21:21:00 +02006637#ifndef WC_ERR_INVALID_CHARS
6638# define WC_ERR_INVALID_CHARS 0x0080
6639#endif
6640
6641static char*
6642code_page_name(UINT code_page, PyObject **obj)
6643{
6644 *obj = NULL;
6645 if (code_page == CP_ACP)
6646 return "mbcs";
6647 if (code_page == CP_UTF7)
6648 return "CP_UTF7";
6649 if (code_page == CP_UTF8)
6650 return "CP_UTF8";
6651
6652 *obj = PyBytes_FromFormat("cp%u", code_page);
6653 if (*obj == NULL)
6654 return NULL;
6655 return PyBytes_AS_STRING(*obj);
6656}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006659is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006660{
6661 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006662 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006663
Victor Stinner3a50e702011-10-18 21:21:00 +02006664 if (!IsDBCSLeadByteEx(code_page, *curr))
6665 return 0;
6666
6667 prev = CharPrevExA(code_page, s, curr, 0);
6668 if (prev == curr)
6669 return 1;
6670 /* FIXME: This code is limited to "true" double-byte encodings,
6671 as it assumes an incomplete character consists of a single
6672 byte. */
6673 if (curr - prev == 2)
6674 return 1;
6675 if (!IsDBCSLeadByteEx(code_page, *prev))
6676 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006677 return 0;
6678}
6679
Victor Stinner3a50e702011-10-18 21:21:00 +02006680static DWORD
6681decode_code_page_flags(UINT code_page)
6682{
6683 if (code_page == CP_UTF7) {
6684 /* The CP_UTF7 decoder only supports flags=0 */
6685 return 0;
6686 }
6687 else
6688 return MB_ERR_INVALID_CHARS;
6689}
6690
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006691/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006692 * Decode a byte string from a Windows code page into unicode object in strict
6693 * mode.
6694 *
6695 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6696 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006697 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006698static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006699decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006700 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006701 const char *in,
6702 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006703{
Victor Stinner3a50e702011-10-18 21:21:00 +02006704 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006705 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006706 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707
6708 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006709 assert(insize > 0);
6710 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6711 if (outsize <= 0)
6712 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006713
6714 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006715 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006716 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006717 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 if (*v == NULL)
6719 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006720 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006721 }
6722 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006723 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006724 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006725 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006727 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006728 }
6729
6730 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6732 if (outsize <= 0)
6733 goto error;
6734 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006735
Victor Stinner3a50e702011-10-18 21:21:00 +02006736error:
6737 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6738 return -2;
6739 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006740 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006741}
6742
Victor Stinner3a50e702011-10-18 21:21:00 +02006743/*
6744 * Decode a byte string from a code page into unicode object with an error
6745 * handler.
6746 *
6747 * Returns consumed size if succeed, or raise a WindowsError or
6748 * UnicodeDecodeError exception and returns -1 on error.
6749 */
6750static int
6751decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006752 PyObject **v,
6753 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006754 const char *errors)
6755{
6756 const char *startin = in;
6757 const char *endin = in + size;
6758 const DWORD flags = decode_code_page_flags(code_page);
6759 /* Ideally, we should get reason from FormatMessage. This is the Windows
6760 2000 English version of the message. */
6761 const char *reason = "No mapping for the Unicode character exists "
6762 "in the target code page.";
6763 /* each step cannot decode more than 1 character, but a character can be
6764 represented as a surrogate pair */
6765 wchar_t buffer[2], *startout, *out;
6766 int insize, outsize;
6767 PyObject *errorHandler = NULL;
6768 PyObject *exc = NULL;
6769 PyObject *encoding_obj = NULL;
6770 char *encoding;
6771 DWORD err;
6772 int ret = -1;
6773
6774 assert(size > 0);
6775
6776 encoding = code_page_name(code_page, &encoding_obj);
6777 if (encoding == NULL)
6778 return -1;
6779
6780 if (errors == NULL || strcmp(errors, "strict") == 0) {
6781 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6782 UnicodeDecodeError. */
6783 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6784 if (exc != NULL) {
6785 PyCodec_StrictErrors(exc);
6786 Py_CLEAR(exc);
6787 }
6788 goto error;
6789 }
6790
6791 if (*v == NULL) {
6792 /* Create unicode object */
6793 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6794 PyErr_NoMemory();
6795 goto error;
6796 }
Victor Stinnerab595942011-12-17 04:59:06 +01006797 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006798 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006799 if (*v == NULL)
6800 goto error;
6801 startout = PyUnicode_AS_UNICODE(*v);
6802 }
6803 else {
6804 /* Extend unicode object */
6805 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6806 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6807 PyErr_NoMemory();
6808 goto error;
6809 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006810 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006811 goto error;
6812 startout = PyUnicode_AS_UNICODE(*v) + n;
6813 }
6814
6815 /* Decode the byte string character per character */
6816 out = startout;
6817 while (in < endin)
6818 {
6819 /* Decode a character */
6820 insize = 1;
6821 do
6822 {
6823 outsize = MultiByteToWideChar(code_page, flags,
6824 in, insize,
6825 buffer, Py_ARRAY_LENGTH(buffer));
6826 if (outsize > 0)
6827 break;
6828 err = GetLastError();
6829 if (err != ERROR_NO_UNICODE_TRANSLATION
6830 && err != ERROR_INSUFFICIENT_BUFFER)
6831 {
6832 PyErr_SetFromWindowsErr(0);
6833 goto error;
6834 }
6835 insize++;
6836 }
6837 /* 4=maximum length of a UTF-8 sequence */
6838 while (insize <= 4 && (in + insize) <= endin);
6839
6840 if (outsize <= 0) {
6841 Py_ssize_t startinpos, endinpos, outpos;
6842
6843 startinpos = in - startin;
6844 endinpos = startinpos + 1;
6845 outpos = out - PyUnicode_AS_UNICODE(*v);
6846 if (unicode_decode_call_errorhandler(
6847 errors, &errorHandler,
6848 encoding, reason,
6849 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006850 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006851 {
6852 goto error;
6853 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006854 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 }
6856 else {
6857 in += insize;
6858 memcpy(out, buffer, outsize * sizeof(wchar_t));
6859 out += outsize;
6860 }
6861 }
6862
6863 /* write a NUL character at the end */
6864 *out = 0;
6865
6866 /* Extend unicode object */
6867 outsize = out - startout;
6868 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006869 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006870 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006871 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006872
6873error:
6874 Py_XDECREF(encoding_obj);
6875 Py_XDECREF(errorHandler);
6876 Py_XDECREF(exc);
6877 return ret;
6878}
6879
Victor Stinner3a50e702011-10-18 21:21:00 +02006880static PyObject *
6881decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006882 const char *s, Py_ssize_t size,
6883 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006884{
Victor Stinner76a31a62011-11-04 00:05:13 +01006885 PyObject *v = NULL;
6886 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006887
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 if (code_page < 0) {
6889 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6890 return NULL;
6891 }
6892
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006893 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006894 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895
Victor Stinner76a31a62011-11-04 00:05:13 +01006896 do
6897 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006898#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006899 if (size > INT_MAX) {
6900 chunk_size = INT_MAX;
6901 final = 0;
6902 done = 0;
6903 }
6904 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006906 {
6907 chunk_size = (int)size;
6908 final = (consumed == NULL);
6909 done = 1;
6910 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911
Victor Stinner76a31a62011-11-04 00:05:13 +01006912 /* Skip trailing lead-byte unless 'final' is set */
6913 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6914 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 if (chunk_size == 0 && done) {
6917 if (v != NULL)
6918 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006919 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006921
Victor Stinner76a31a62011-11-04 00:05:13 +01006922
6923 converted = decode_code_page_strict(code_page, &v,
6924 s, chunk_size);
6925 if (converted == -2)
6926 converted = decode_code_page_errors(code_page, &v,
6927 s, chunk_size,
6928 errors);
6929 assert(converted != 0);
6930
6931 if (converted < 0) {
6932 Py_XDECREF(v);
6933 return NULL;
6934 }
6935
6936 if (consumed)
6937 *consumed += converted;
6938
6939 s += converted;
6940 size -= converted;
6941 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006942
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006943 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944}
6945
Alexander Belopolsky40018472011-02-26 01:02:56 +00006946PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006947PyUnicode_DecodeCodePageStateful(int code_page,
6948 const char *s,
6949 Py_ssize_t size,
6950 const char *errors,
6951 Py_ssize_t *consumed)
6952{
6953 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6954}
6955
6956PyObject *
6957PyUnicode_DecodeMBCSStateful(const char *s,
6958 Py_ssize_t size,
6959 const char *errors,
6960 Py_ssize_t *consumed)
6961{
6962 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6963}
6964
6965PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyUnicode_DecodeMBCS(const char *s,
6967 Py_ssize_t size,
6968 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006969{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6971}
6972
Victor Stinner3a50e702011-10-18 21:21:00 +02006973static DWORD
6974encode_code_page_flags(UINT code_page, const char *errors)
6975{
6976 if (code_page == CP_UTF8) {
6977 if (winver.dwMajorVersion >= 6)
6978 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6979 and later */
6980 return WC_ERR_INVALID_CHARS;
6981 else
6982 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6983 return 0;
6984 }
6985 else if (code_page == CP_UTF7) {
6986 /* CP_UTF7 only supports flags=0 */
6987 return 0;
6988 }
6989 else {
6990 if (errors != NULL && strcmp(errors, "replace") == 0)
6991 return 0;
6992 else
6993 return WC_NO_BEST_FIT_CHARS;
6994 }
6995}
6996
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006998 * Encode a Unicode string to a Windows code page into a byte string in strict
6999 * mode.
7000 *
7001 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7002 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007003 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007004static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007005encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007006 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007007 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008{
Victor Stinner554f3f02010-06-16 23:33:54 +00007009 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007010 BOOL *pusedDefaultChar = &usedDefaultChar;
7011 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007012 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007013 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007014 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 const DWORD flags = encode_code_page_flags(code_page, NULL);
7016 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007017 /* Create a substring so that we can get the UTF-16 representation
7018 of just the slice under consideration. */
7019 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020
Martin v. Löwis3d325192011-11-04 18:23:06 +01007021 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007022
Victor Stinner3a50e702011-10-18 21:21:00 +02007023 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007024 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007025 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007026 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007027
Victor Stinner2fc507f2011-11-04 20:06:39 +01007028 substring = PyUnicode_Substring(unicode, offset, offset+len);
7029 if (substring == NULL)
7030 return -1;
7031 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7032 if (p == NULL) {
7033 Py_DECREF(substring);
7034 return -1;
7035 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007036
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007037 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007038 outsize = WideCharToMultiByte(code_page, flags,
7039 p, size,
7040 NULL, 0,
7041 NULL, pusedDefaultChar);
7042 if (outsize <= 0)
7043 goto error;
7044 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007045 if (pusedDefaultChar && *pusedDefaultChar) {
7046 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007047 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007048 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007049
Victor Stinner3a50e702011-10-18 21:21:00 +02007050 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007052 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007053 if (*outbytes == NULL) {
7054 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007056 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007057 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007058 }
7059 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 const Py_ssize_t n = PyBytes_Size(*outbytes);
7062 if (outsize > PY_SSIZE_T_MAX - n) {
7063 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007064 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007066 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007067 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7068 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007069 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007070 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007071 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072 }
7073
7074 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 outsize = WideCharToMultiByte(code_page, flags,
7076 p, size,
7077 out, outsize,
7078 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007079 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007080 if (outsize <= 0)
7081 goto error;
7082 if (pusedDefaultChar && *pusedDefaultChar)
7083 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007084 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007085
Victor Stinner3a50e702011-10-18 21:21:00 +02007086error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007087 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007088 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7089 return -2;
7090 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007091 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007092}
7093
Victor Stinner3a50e702011-10-18 21:21:00 +02007094/*
7095 * Encode a Unicode string to a Windows code page into a byte string using a
7096 * error handler.
7097 *
7098 * Returns consumed characters if succeed, or raise a WindowsError and returns
7099 * -1 on other error.
7100 */
7101static int
7102encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007103 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007104 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007105{
Victor Stinner3a50e702011-10-18 21:21:00 +02007106 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007107 Py_ssize_t pos = unicode_offset;
7108 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 /* Ideally, we should get reason from FormatMessage. This is the Windows
7110 2000 English version of the message. */
7111 const char *reason = "invalid character";
7112 /* 4=maximum length of a UTF-8 sequence */
7113 char buffer[4];
7114 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7115 Py_ssize_t outsize;
7116 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 PyObject *errorHandler = NULL;
7118 PyObject *exc = NULL;
7119 PyObject *encoding_obj = NULL;
7120 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007121 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007122 PyObject *rep;
7123 int ret = -1;
7124
7125 assert(insize > 0);
7126
7127 encoding = code_page_name(code_page, &encoding_obj);
7128 if (encoding == NULL)
7129 return -1;
7130
7131 if (errors == NULL || strcmp(errors, "strict") == 0) {
7132 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7133 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007134 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007135 if (exc != NULL) {
7136 PyCodec_StrictErrors(exc);
7137 Py_DECREF(exc);
7138 }
7139 Py_XDECREF(encoding_obj);
7140 return -1;
7141 }
7142
7143 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7144 pusedDefaultChar = &usedDefaultChar;
7145 else
7146 pusedDefaultChar = NULL;
7147
7148 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7149 PyErr_NoMemory();
7150 goto error;
7151 }
7152 outsize = insize * Py_ARRAY_LENGTH(buffer);
7153
7154 if (*outbytes == NULL) {
7155 /* Create string object */
7156 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7157 if (*outbytes == NULL)
7158 goto error;
7159 out = PyBytes_AS_STRING(*outbytes);
7160 }
7161 else {
7162 /* Extend string object */
7163 Py_ssize_t n = PyBytes_Size(*outbytes);
7164 if (n > PY_SSIZE_T_MAX - outsize) {
7165 PyErr_NoMemory();
7166 goto error;
7167 }
7168 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7169 goto error;
7170 out = PyBytes_AS_STRING(*outbytes) + n;
7171 }
7172
7173 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007174 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007176 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7177 wchar_t chars[2];
7178 int charsize;
7179 if (ch < 0x10000) {
7180 chars[0] = (wchar_t)ch;
7181 charsize = 1;
7182 }
7183 else {
7184 ch -= 0x10000;
7185 chars[0] = 0xd800 + (ch >> 10);
7186 chars[1] = 0xdc00 + (ch & 0x3ff);
7187 charsize = 2;
7188 }
7189
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007191 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 buffer, Py_ARRAY_LENGTH(buffer),
7193 NULL, pusedDefaultChar);
7194 if (outsize > 0) {
7195 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7196 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007197 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 memcpy(out, buffer, outsize);
7199 out += outsize;
7200 continue;
7201 }
7202 }
7203 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7204 PyErr_SetFromWindowsErr(0);
7205 goto error;
7206 }
7207
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 rep = unicode_encode_call_errorhandler(
7209 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007210 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007211 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 if (rep == NULL)
7213 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007214 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007215
7216 if (PyBytes_Check(rep)) {
7217 outsize = PyBytes_GET_SIZE(rep);
7218 if (outsize != 1) {
7219 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7220 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7221 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7222 Py_DECREF(rep);
7223 goto error;
7224 }
7225 out = PyBytes_AS_STRING(*outbytes) + offset;
7226 }
7227 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7228 out += outsize;
7229 }
7230 else {
7231 Py_ssize_t i;
7232 enum PyUnicode_Kind kind;
7233 void *data;
7234
Benjamin Petersonbac79492012-01-14 13:34:47 -05007235 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 Py_DECREF(rep);
7237 goto error;
7238 }
7239
7240 outsize = PyUnicode_GET_LENGTH(rep);
7241 if (outsize != 1) {
7242 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7243 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7244 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7245 Py_DECREF(rep);
7246 goto error;
7247 }
7248 out = PyBytes_AS_STRING(*outbytes) + offset;
7249 }
7250 kind = PyUnicode_KIND(rep);
7251 data = PyUnicode_DATA(rep);
7252 for (i=0; i < outsize; i++) {
7253 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7254 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007255 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007256 encoding, unicode,
7257 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 "unable to encode error handler result to ASCII");
7259 Py_DECREF(rep);
7260 goto error;
7261 }
7262 *out = (unsigned char)ch;
7263 out++;
7264 }
7265 }
7266 Py_DECREF(rep);
7267 }
7268 /* write a NUL byte */
7269 *out = 0;
7270 outsize = out - PyBytes_AS_STRING(*outbytes);
7271 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7272 if (_PyBytes_Resize(outbytes, outsize) < 0)
7273 goto error;
7274 ret = 0;
7275
7276error:
7277 Py_XDECREF(encoding_obj);
7278 Py_XDECREF(errorHandler);
7279 Py_XDECREF(exc);
7280 return ret;
7281}
7282
Victor Stinner3a50e702011-10-18 21:21:00 +02007283static PyObject *
7284encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007285 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 const char *errors)
7287{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007288 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007290 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007291 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007292
Benjamin Petersonbac79492012-01-14 13:34:47 -05007293 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007294 return NULL;
7295 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007296
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 if (code_page < 0) {
7298 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7299 return NULL;
7300 }
7301
Martin v. Löwis3d325192011-11-04 18:23:06 +01007302 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 return PyBytes_FromStringAndSize(NULL, 0);
7304
Victor Stinner7581cef2011-11-03 22:32:33 +01007305 offset = 0;
7306 do
7307 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007308#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007309 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007310 chunks. */
7311 if (len > INT_MAX/2) {
7312 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007313 done = 0;
7314 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007315 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007316#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007318 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007319 done = 1;
7320 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007321
Victor Stinner76a31a62011-11-04 00:05:13 +01007322 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007323 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007324 errors);
7325 if (ret == -2)
7326 ret = encode_code_page_errors(code_page, &outbytes,
7327 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007328 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007329 if (ret < 0) {
7330 Py_XDECREF(outbytes);
7331 return NULL;
7332 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333
Victor Stinner7581cef2011-11-03 22:32:33 +01007334 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007335 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 return outbytes;
7339}
7340
7341PyObject *
7342PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7343 Py_ssize_t size,
7344 const char *errors)
7345{
Victor Stinner7581cef2011-11-03 22:32:33 +01007346 PyObject *unicode, *res;
7347 unicode = PyUnicode_FromUnicode(p, size);
7348 if (unicode == NULL)
7349 return NULL;
7350 res = encode_code_page(CP_ACP, unicode, errors);
7351 Py_DECREF(unicode);
7352 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007353}
7354
7355PyObject *
7356PyUnicode_EncodeCodePage(int code_page,
7357 PyObject *unicode,
7358 const char *errors)
7359{
Victor Stinner7581cef2011-11-03 22:32:33 +01007360 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007361}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007362
Alexander Belopolsky40018472011-02-26 01:02:56 +00007363PyObject *
7364PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007365{
7366 if (!PyUnicode_Check(unicode)) {
7367 PyErr_BadArgument();
7368 return NULL;
7369 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007370 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007371}
7372
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007373#undef NEED_RETRY
7374
Victor Stinner99b95382011-07-04 14:23:54 +02007375#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007376
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377/* --- Character Mapping Codec -------------------------------------------- */
7378
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379PyObject *
7380PyUnicode_DecodeCharmap(const char *s,
7381 Py_ssize_t size,
7382 PyObject *mapping,
7383 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007386 Py_ssize_t startinpos;
7387 Py_ssize_t endinpos;
7388 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007390 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007391 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007392 PyObject *errorHandler = NULL;
7393 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007394
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395 /* Default to Latin-1 */
7396 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007398
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007399 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007400 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007403 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007404 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007405 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007406 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007407 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007408 enum PyUnicode_Kind mapkind;
7409 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007410 Py_UCS4 x;
7411
Benjamin Petersonbac79492012-01-14 13:34:47 -05007412 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007413 return NULL;
7414
7415 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007416 mapdata = PyUnicode_DATA(mapping);
7417 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007419 unsigned char ch;
7420 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7421 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7422 if (outkind == PyUnicode_1BYTE_KIND) {
7423 void *outdata = PyUnicode_DATA(v);
7424 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7425 while (s < e) {
7426 unsigned char ch = *s;
7427 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7428 if (x > maxchar)
7429 goto Error;
7430 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7431 ++s;
7432 }
7433 break;
7434 }
7435 else if (outkind == PyUnicode_2BYTE_KIND) {
7436 void *outdata = PyUnicode_DATA(v);
7437 while (s < e) {
7438 unsigned char ch = *s;
7439 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7440 if (x == 0xFFFE)
7441 goto Error;
7442 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7443 ++s;
7444 }
7445 break;
7446 }
7447 }
7448 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007449
Benjamin Peterson29060642009-01-31 22:14:21 +00007450 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007451 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007452 else
7453 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007454Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007455 if (x == 0xfffe)
7456 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007457 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007458 startinpos = s-starts;
7459 endinpos = startinpos+1;
7460 if (unicode_decode_call_errorhandler(
7461 errors, &errorHandler,
7462 "charmap", "character maps to <undefined>",
7463 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007464 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 goto onError;
7466 }
7467 continue;
7468 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007469
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007470 if (unicode_putchar(&v, &outpos, x) < 0)
7471 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007472 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007473 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007474 }
7475 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 while (s < e) {
7477 unsigned char ch = *s;
7478 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007479
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7481 w = PyLong_FromLong((long)ch);
7482 if (w == NULL)
7483 goto onError;
7484 x = PyObject_GetItem(mapping, w);
7485 Py_DECREF(w);
7486 if (x == NULL) {
7487 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7488 /* No mapping found means: mapping is undefined. */
7489 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007490 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007491 } else
7492 goto onError;
7493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007494
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007496 if (x == Py_None)
7497 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007498 if (PyLong_Check(x)) {
7499 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007500 if (value == 0xFFFE)
7501 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007502 if (value < 0 || value > MAX_UNICODE) {
7503 PyErr_Format(PyExc_TypeError,
7504 "character mapping must be in range(0x%lx)",
7505 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 Py_DECREF(x);
7507 goto onError;
7508 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007509 if (unicode_putchar(&v, &outpos, value) < 0) {
7510 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007511 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007512 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007513 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007515 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007516
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007517 if (PyUnicode_READY(x) == -1) {
7518 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007519 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007520 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007521 targetsize = PyUnicode_GET_LENGTH(x);
7522
7523 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007524 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007525 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007526 if (value == 0xFFFE)
7527 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007528 if (unicode_putchar(&v, &outpos, value) < 0) {
7529 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007530 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007531 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007533 else if (targetsize > 1) {
7534 /* 1-n mapping */
7535 if (targetsize > extrachars) {
7536 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 Py_ssize_t needed = (targetsize - extrachars) + \
7538 (targetsize << 2);
7539 extrachars += needed;
7540 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007541 if (unicode_resize(&v,
7542 PyUnicode_GET_LENGTH(v) + needed) < 0)
7543 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007544 Py_DECREF(x);
7545 goto onError;
7546 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007548 if (unicode_widen(&v, outpos,
7549 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7550 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007551 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007552 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007553 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7554 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 extrachars -= targetsize;
7556 }
7557 /* 1-0 mapping: skip the character */
7558 }
7559 else {
7560 /* wrong return value */
7561 PyErr_SetString(PyExc_TypeError,
7562 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007563 Py_DECREF(x);
7564 goto onError;
7565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007566 Py_DECREF(x);
7567 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007568 continue;
7569Undefined:
7570 /* undefined mapping */
7571 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007572 startinpos = s-starts;
7573 endinpos = startinpos+1;
7574 if (unicode_decode_call_errorhandler(
7575 errors, &errorHandler,
7576 "charmap", "character maps to <undefined>",
7577 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007578 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007579 goto onError;
7580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007582 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007583 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007584 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007585 Py_XDECREF(errorHandler);
7586 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007587 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007588
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007590 Py_XDECREF(errorHandler);
7591 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 Py_XDECREF(v);
7593 return NULL;
7594}
7595
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007596/* Charmap encoding: the lookup table */
7597
Alexander Belopolsky40018472011-02-26 01:02:56 +00007598struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 PyObject_HEAD
7600 unsigned char level1[32];
7601 int count2, count3;
7602 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007603};
7604
7605static PyObject*
7606encoding_map_size(PyObject *obj, PyObject* args)
7607{
7608 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007609 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007611}
7612
7613static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007614 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007615 PyDoc_STR("Return the size (in bytes) of this object") },
7616 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617};
7618
7619static void
7620encoding_map_dealloc(PyObject* o)
7621{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007622 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007623}
7624
7625static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007627 "EncodingMap", /*tp_name*/
7628 sizeof(struct encoding_map), /*tp_basicsize*/
7629 0, /*tp_itemsize*/
7630 /* methods */
7631 encoding_map_dealloc, /*tp_dealloc*/
7632 0, /*tp_print*/
7633 0, /*tp_getattr*/
7634 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007635 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 0, /*tp_repr*/
7637 0, /*tp_as_number*/
7638 0, /*tp_as_sequence*/
7639 0, /*tp_as_mapping*/
7640 0, /*tp_hash*/
7641 0, /*tp_call*/
7642 0, /*tp_str*/
7643 0, /*tp_getattro*/
7644 0, /*tp_setattro*/
7645 0, /*tp_as_buffer*/
7646 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7647 0, /*tp_doc*/
7648 0, /*tp_traverse*/
7649 0, /*tp_clear*/
7650 0, /*tp_richcompare*/
7651 0, /*tp_weaklistoffset*/
7652 0, /*tp_iter*/
7653 0, /*tp_iternext*/
7654 encoding_map_methods, /*tp_methods*/
7655 0, /*tp_members*/
7656 0, /*tp_getset*/
7657 0, /*tp_base*/
7658 0, /*tp_dict*/
7659 0, /*tp_descr_get*/
7660 0, /*tp_descr_set*/
7661 0, /*tp_dictoffset*/
7662 0, /*tp_init*/
7663 0, /*tp_alloc*/
7664 0, /*tp_new*/
7665 0, /*tp_free*/
7666 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007667};
7668
7669PyObject*
7670PyUnicode_BuildEncodingMap(PyObject* string)
7671{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007672 PyObject *result;
7673 struct encoding_map *mresult;
7674 int i;
7675 int need_dict = 0;
7676 unsigned char level1[32];
7677 unsigned char level2[512];
7678 unsigned char *mlevel1, *mlevel2, *mlevel3;
7679 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007680 int kind;
7681 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007682 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007683 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007684
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007685 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007686 PyErr_BadArgument();
7687 return NULL;
7688 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007689 kind = PyUnicode_KIND(string);
7690 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007691 length = PyUnicode_GET_LENGTH(string);
7692 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007693 memset(level1, 0xFF, sizeof level1);
7694 memset(level2, 0xFF, sizeof level2);
7695
7696 /* If there isn't a one-to-one mapping of NULL to \0,
7697 or if there are non-BMP characters, we need to use
7698 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007699 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007700 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007701 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007702 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 ch = PyUnicode_READ(kind, data, i);
7704 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007705 need_dict = 1;
7706 break;
7707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007708 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007709 /* unmapped character */
7710 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007711 l1 = ch >> 11;
7712 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 if (level1[l1] == 0xFF)
7714 level1[l1] = count2++;
7715 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007716 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 }
7718
7719 if (count2 >= 0xFF || count3 >= 0xFF)
7720 need_dict = 1;
7721
7722 if (need_dict) {
7723 PyObject *result = PyDict_New();
7724 PyObject *key, *value;
7725 if (!result)
7726 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007727 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007728 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007729 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007730 if (!key || !value)
7731 goto failed1;
7732 if (PyDict_SetItem(result, key, value) == -1)
7733 goto failed1;
7734 Py_DECREF(key);
7735 Py_DECREF(value);
7736 }
7737 return result;
7738 failed1:
7739 Py_XDECREF(key);
7740 Py_XDECREF(value);
7741 Py_DECREF(result);
7742 return NULL;
7743 }
7744
7745 /* Create a three-level trie */
7746 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7747 16*count2 + 128*count3 - 1);
7748 if (!result)
7749 return PyErr_NoMemory();
7750 PyObject_Init(result, &EncodingMapType);
7751 mresult = (struct encoding_map*)result;
7752 mresult->count2 = count2;
7753 mresult->count3 = count3;
7754 mlevel1 = mresult->level1;
7755 mlevel2 = mresult->level23;
7756 mlevel3 = mresult->level23 + 16*count2;
7757 memcpy(mlevel1, level1, 32);
7758 memset(mlevel2, 0xFF, 16*count2);
7759 memset(mlevel3, 0, 128*count3);
7760 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007761 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007762 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007763 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7764 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007765 /* unmapped character */
7766 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007767 o1 = ch>>11;
7768 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 i2 = 16*mlevel1[o1] + o2;
7770 if (mlevel2[i2] == 0xFF)
7771 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007772 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 i3 = 128*mlevel2[i2] + o3;
7774 mlevel3[i3] = i;
7775 }
7776 return result;
7777}
7778
7779static int
Victor Stinner22168992011-11-20 17:09:18 +01007780encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007781{
7782 struct encoding_map *map = (struct encoding_map*)mapping;
7783 int l1 = c>>11;
7784 int l2 = (c>>7) & 0xF;
7785 int l3 = c & 0x7F;
7786 int i;
7787
Victor Stinner22168992011-11-20 17:09:18 +01007788 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007790 if (c == 0)
7791 return 0;
7792 /* level 1*/
7793 i = map->level1[l1];
7794 if (i == 0xFF) {
7795 return -1;
7796 }
7797 /* level 2*/
7798 i = map->level23[16*i+l2];
7799 if (i == 0xFF) {
7800 return -1;
7801 }
7802 /* level 3 */
7803 i = map->level23[16*map->count2 + 128*i + l3];
7804 if (i == 0) {
7805 return -1;
7806 }
7807 return i;
7808}
7809
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810/* Lookup the character ch in the mapping. If the character
7811 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007812 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007813static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007814charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007815{
Christian Heimes217cfd12007-12-02 14:31:20 +00007816 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007817 PyObject *x;
7818
7819 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 x = PyObject_GetItem(mapping, w);
7822 Py_DECREF(w);
7823 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7825 /* No mapping found means: mapping is undefined. */
7826 PyErr_Clear();
7827 x = Py_None;
7828 Py_INCREF(x);
7829 return x;
7830 } else
7831 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007832 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007833 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007835 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 long value = PyLong_AS_LONG(x);
7837 if (value < 0 || value > 255) {
7838 PyErr_SetString(PyExc_TypeError,
7839 "character mapping must be in range(256)");
7840 Py_DECREF(x);
7841 return NULL;
7842 }
7843 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007845 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007846 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007847 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 /* wrong return value */
7849 PyErr_Format(PyExc_TypeError,
7850 "character mapping must return integer, bytes or None, not %.400s",
7851 x->ob_type->tp_name);
7852 Py_DECREF(x);
7853 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007854 }
7855}
7856
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007857static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007858charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007859{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007860 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7861 /* exponentially overallocate to minimize reallocations */
7862 if (requiredsize < 2*outsize)
7863 requiredsize = 2*outsize;
7864 if (_PyBytes_Resize(outobj, requiredsize))
7865 return -1;
7866 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007867}
7868
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007872/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007873 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007874 space is available. Return a new reference to the object that
7875 was put in the output buffer, or Py_None, if the mapping was undefined
7876 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007877 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007878static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007879charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007880 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007881{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007882 PyObject *rep;
7883 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007884 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885
Christian Heimes90aa7642007-12-19 02:45:37 +00007886 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 if (res == -1)
7890 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007891 if (outsize<requiredsize)
7892 if (charmapencode_resize(outobj, outpos, requiredsize))
7893 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007894 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 outstart[(*outpos)++] = (char)res;
7896 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897 }
7898
7899 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007900 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007902 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007903 Py_DECREF(rep);
7904 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007906 if (PyLong_Check(rep)) {
7907 Py_ssize_t requiredsize = *outpos+1;
7908 if (outsize<requiredsize)
7909 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7910 Py_DECREF(rep);
7911 return enc_EXCEPTION;
7912 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007913 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007914 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007915 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 else {
7917 const char *repchars = PyBytes_AS_STRING(rep);
7918 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7919 Py_ssize_t requiredsize = *outpos+repsize;
7920 if (outsize<requiredsize)
7921 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7922 Py_DECREF(rep);
7923 return enc_EXCEPTION;
7924 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007925 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 memcpy(outstart + *outpos, repchars, repsize);
7927 *outpos += repsize;
7928 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007929 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930 Py_DECREF(rep);
7931 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007932}
7933
7934/* handle an error in PyUnicode_EncodeCharmap
7935 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007936static int
7937charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007938 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007939 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007940 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007941 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007942{
7943 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007944 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007945 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007946 enum PyUnicode_Kind kind;
7947 void *data;
7948 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007950 Py_ssize_t collstartpos = *inpos;
7951 Py_ssize_t collendpos = *inpos+1;
7952 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 char *encoding = "charmap";
7954 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007955 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007956 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007957 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958
Benjamin Petersonbac79492012-01-14 13:34:47 -05007959 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 return -1;
7961 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962 /* find all unencodable characters */
7963 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007964 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007965 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007966 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007967 val = encoding_map_lookup(ch, mapping);
7968 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 break;
7970 ++collendpos;
7971 continue;
7972 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007973
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007974 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7975 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 if (rep==NULL)
7977 return -1;
7978 else if (rep!=Py_None) {
7979 Py_DECREF(rep);
7980 break;
7981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007982 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 }
7985 /* cache callback name lookup
7986 * (if not done yet, i.e. it's the first error) */
7987 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 if ((errors==NULL) || (!strcmp(errors, "strict")))
7989 *known_errorHandler = 1;
7990 else if (!strcmp(errors, "replace"))
7991 *known_errorHandler = 2;
7992 else if (!strcmp(errors, "ignore"))
7993 *known_errorHandler = 3;
7994 else if (!strcmp(errors, "xmlcharrefreplace"))
7995 *known_errorHandler = 4;
7996 else
7997 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998 }
7999 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008000 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008001 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008002 return -1;
8003 case 2: /* replace */
8004 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008005 x = charmapencode_output('?', mapping, res, respos);
8006 if (x==enc_EXCEPTION) {
8007 return -1;
8008 }
8009 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008010 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 return -1;
8012 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008013 }
8014 /* fall through */
8015 case 3: /* ignore */
8016 *inpos = collendpos;
8017 break;
8018 case 4: /* xmlcharrefreplace */
8019 /* generate replacement (temporarily (mis)uses p) */
8020 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 char buffer[2+29+1+1];
8022 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008023 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008024 for (cp = buffer; *cp; ++cp) {
8025 x = charmapencode_output(*cp, mapping, res, respos);
8026 if (x==enc_EXCEPTION)
8027 return -1;
8028 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008029 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008030 return -1;
8031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008032 }
8033 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008034 *inpos = collendpos;
8035 break;
8036 default:
8037 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008038 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008041 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008042 if (PyBytes_Check(repunicode)) {
8043 /* Directly copy bytes result to output. */
8044 Py_ssize_t outsize = PyBytes_Size(*res);
8045 Py_ssize_t requiredsize;
8046 repsize = PyBytes_Size(repunicode);
8047 requiredsize = *respos + repsize;
8048 if (requiredsize > outsize)
8049 /* Make room for all additional bytes. */
8050 if (charmapencode_resize(res, respos, requiredsize)) {
8051 Py_DECREF(repunicode);
8052 return -1;
8053 }
8054 memcpy(PyBytes_AsString(*res) + *respos,
8055 PyBytes_AsString(repunicode), repsize);
8056 *respos += repsize;
8057 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008058 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008059 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008060 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008061 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008062 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008063 Py_DECREF(repunicode);
8064 return -1;
8065 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008066 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 data = PyUnicode_DATA(repunicode);
8068 kind = PyUnicode_KIND(repunicode);
8069 for (index = 0; index < repsize; index++) {
8070 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8071 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008073 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 return -1;
8075 }
8076 else if (x==enc_FAILED) {
8077 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008078 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008079 return -1;
8080 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008081 }
8082 *inpos = newpos;
8083 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084 }
8085 return 0;
8086}
8087
Alexander Belopolsky40018472011-02-26 01:02:56 +00008088PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008089_PyUnicode_EncodeCharmap(PyObject *unicode,
8090 PyObject *mapping,
8091 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 /* output object */
8094 PyObject *res = NULL;
8095 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008096 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008097 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008099 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 PyObject *errorHandler = NULL;
8101 PyObject *exc = NULL;
8102 /* the following variable is used for caching string comparisons
8103 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8104 * 3=ignore, 4=xmlcharrefreplace */
8105 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008106
Benjamin Petersonbac79492012-01-14 13:34:47 -05008107 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008108 return NULL;
8109 size = PyUnicode_GET_LENGTH(unicode);
8110
Guido van Rossumd57fd912000-03-10 22:53:23 +00008111 /* Default to Latin-1 */
8112 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008113 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008115 /* allocate enough for a simple encoding without
8116 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008117 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 if (res == NULL)
8119 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008120 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008123 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008126 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 if (x==enc_EXCEPTION) /* error */
8128 goto onError;
8129 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 &exc,
8132 &known_errorHandler, &errorHandler, errors,
8133 &res, &respos)) {
8134 goto onError;
8135 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008136 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 else
8138 /* done with this character => adjust input position */
8139 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008143 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008144 if (_PyBytes_Resize(&res, respos) < 0)
8145 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008146
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008147 Py_XDECREF(exc);
8148 Py_XDECREF(errorHandler);
8149 return res;
8150
Benjamin Peterson29060642009-01-31 22:14:21 +00008151 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 Py_XDECREF(res);
8153 Py_XDECREF(exc);
8154 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008155 return NULL;
8156}
8157
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008158/* Deprecated */
8159PyObject *
8160PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8161 Py_ssize_t size,
8162 PyObject *mapping,
8163 const char *errors)
8164{
8165 PyObject *result;
8166 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8167 if (unicode == NULL)
8168 return NULL;
8169 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8170 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008171 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008172}
8173
Alexander Belopolsky40018472011-02-26 01:02:56 +00008174PyObject *
8175PyUnicode_AsCharmapString(PyObject *unicode,
8176 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177{
8178 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 PyErr_BadArgument();
8180 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008182 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008183}
8184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008185/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008186static void
8187make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008189 Py_ssize_t startpos, Py_ssize_t endpos,
8190 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008191{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008192 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193 *exceptionObject = _PyUnicodeTranslateError_Create(
8194 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 }
8196 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8198 goto onError;
8199 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8200 goto onError;
8201 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8202 goto onError;
8203 return;
8204 onError:
8205 Py_DECREF(*exceptionObject);
8206 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008207 }
8208}
8209
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008210/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008211static void
8212raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008214 Py_ssize_t startpos, Py_ssize_t endpos,
8215 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008216{
8217 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008218 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008219 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221}
8222
8223/* error handling callback helper:
8224 build arguments, call the callback and check the arguments,
8225 put the result into newpos and return the replacement string, which
8226 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008227static PyObject *
8228unicode_translate_call_errorhandler(const char *errors,
8229 PyObject **errorHandler,
8230 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008231 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008232 Py_ssize_t startpos, Py_ssize_t endpos,
8233 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008235 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008237 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 PyObject *restuple;
8239 PyObject *resunicode;
8240
8241 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008245 }
8246
8247 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008251
8252 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008254 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008257 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 Py_DECREF(restuple);
8259 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 }
8261 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 &resunicode, &i_newpos)) {
8263 Py_DECREF(restuple);
8264 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008265 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008266 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008268 else
8269 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008270 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8272 Py_DECREF(restuple);
8273 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008274 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 Py_INCREF(resunicode);
8276 Py_DECREF(restuple);
8277 return resunicode;
8278}
8279
8280/* Lookup the character ch in the mapping and put the result in result,
8281 which must be decrefed by the caller.
8282 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008283static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008285{
Christian Heimes217cfd12007-12-02 14:31:20 +00008286 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287 PyObject *x;
8288
8289 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008290 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 x = PyObject_GetItem(mapping, w);
8292 Py_DECREF(w);
8293 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8295 /* No mapping found means: use 1:1 mapping. */
8296 PyErr_Clear();
8297 *result = NULL;
8298 return 0;
8299 } else
8300 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 }
8302 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 *result = x;
8304 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008306 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 long value = PyLong_AS_LONG(x);
8308 long max = PyUnicode_GetMax();
8309 if (value < 0 || value > max) {
8310 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008311 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 Py_DECREF(x);
8313 return -1;
8314 }
8315 *result = x;
8316 return 0;
8317 }
8318 else if (PyUnicode_Check(x)) {
8319 *result = x;
8320 return 0;
8321 }
8322 else {
8323 /* wrong return value */
8324 PyErr_SetString(PyExc_TypeError,
8325 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008326 Py_DECREF(x);
8327 return -1;
8328 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008329}
8330/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 if not reallocate and adjust various state variables.
8332 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008333static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008337 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008338 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008339 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 /* exponentially overallocate to minimize reallocations */
8341 if (requiredsize < 2 * oldsize)
8342 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008343 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8344 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008345 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008346 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008347 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 }
8349 return 0;
8350}
8351/* lookup the character, put the result in the output string and adjust
8352 various state variables. Return a new reference to the object that
8353 was put in the output buffer in *result, or Py_None, if the mapping was
8354 undefined (in which case no character was written).
8355 The called must decref result.
8356 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008357static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8359 PyObject *mapping, Py_UCS4 **output,
8360 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008363 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8364 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 }
8370 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008372 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008374 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 }
8376 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 Py_ssize_t repsize;
8378 if (PyUnicode_READY(*res) == -1)
8379 return -1;
8380 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (repsize==1) {
8382 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 }
8385 else if (repsize!=0) {
8386 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 Py_ssize_t requiredsize = *opos +
8388 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 Py_ssize_t i;
8391 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008393 for(i = 0; i < repsize; i++)
8394 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
8397 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008399 return 0;
8400}
8401
Alexander Belopolsky40018472011-02-26 01:02:56 +00008402PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403_PyUnicode_TranslateCharmap(PyObject *input,
8404 PyObject *mapping,
8405 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407 /* input object */
8408 char *idata;
8409 Py_ssize_t size, i;
8410 int kind;
8411 /* output buffer */
8412 Py_UCS4 *output = NULL;
8413 Py_ssize_t osize;
8414 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417 char *reason = "character maps to <undefined>";
8418 PyObject *errorHandler = NULL;
8419 PyObject *exc = NULL;
8420 /* the following variable is used for caching string comparisons
8421 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8422 * 3=ignore, 4=xmlcharrefreplace */
8423 int known_errorHandler = -1;
8424
Guido van Rossumd57fd912000-03-10 22:53:23 +00008425 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 PyErr_BadArgument();
8427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008428 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008430 if (PyUnicode_READY(input) == -1)
8431 return NULL;
8432 idata = (char*)PyUnicode_DATA(input);
8433 kind = PyUnicode_KIND(input);
8434 size = PyUnicode_GET_LENGTH(input);
8435 i = 0;
8436
8437 if (size == 0) {
8438 Py_INCREF(input);
8439 return input;
8440 }
8441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442 /* allocate enough for a simple 1:1 translation without
8443 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 osize = size;
8445 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8446 opos = 0;
8447 if (output == NULL) {
8448 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 /* try to encode it */
8454 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 if (charmaptranslate_output(input, i, mapping,
8456 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 Py_XDECREF(x);
8458 goto onError;
8459 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008460 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 else { /* untranslatable character */
8464 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8465 Py_ssize_t repsize;
8466 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008467 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469 Py_ssize_t collstart = i;
8470 Py_ssize_t collend = i+1;
8471 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008472
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008474 while (collend < size) {
8475 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 goto onError;
8477 Py_XDECREF(x);
8478 if (x!=Py_None)
8479 break;
8480 ++collend;
8481 }
8482 /* cache callback name lookup
8483 * (if not done yet, i.e. it's the first error) */
8484 if (known_errorHandler==-1) {
8485 if ((errors==NULL) || (!strcmp(errors, "strict")))
8486 known_errorHandler = 1;
8487 else if (!strcmp(errors, "replace"))
8488 known_errorHandler = 2;
8489 else if (!strcmp(errors, "ignore"))
8490 known_errorHandler = 3;
8491 else if (!strcmp(errors, "xmlcharrefreplace"))
8492 known_errorHandler = 4;
8493 else
8494 known_errorHandler = 0;
8495 }
8496 switch (known_errorHandler) {
8497 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 raise_translate_exception(&exc, input, collstart,
8499 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 case 2: /* replace */
8502 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008503 for (coll = collstart; coll<collend; coll++)
8504 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 /* fall through */
8506 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 break;
8509 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 /* generate replacement (temporarily (mis)uses i) */
8511 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 char buffer[2+29+1+1];
8513 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8515 if (charmaptranslate_makespace(&output, &osize,
8516 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 goto onError;
8518 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 break;
8523 default:
8524 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 reason, input, &exc,
8526 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008527 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008529 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008530 Py_DECREF(repunicode);
8531 goto onError;
8532 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008533 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 repsize = PyUnicode_GET_LENGTH(repunicode);
8535 if (charmaptranslate_makespace(&output, &osize,
8536 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 Py_DECREF(repunicode);
8538 goto onError;
8539 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 for (uni2 = 0; repsize-->0; ++uni2)
8541 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8542 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008545 }
8546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008547 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8548 if (!res)
8549 goto onError;
8550 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 Py_XDECREF(exc);
8552 Py_XDECREF(errorHandler);
8553 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008554
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 Py_XDECREF(exc);
8558 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008559 return NULL;
8560}
8561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562/* Deprecated. Use PyUnicode_Translate instead. */
8563PyObject *
8564PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8565 Py_ssize_t size,
8566 PyObject *mapping,
8567 const char *errors)
8568{
Christian Heimes5f520f42012-09-11 14:03:25 +02008569 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8571 if (!unicode)
8572 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008573 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8574 Py_DECREF(unicode);
8575 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576}
8577
Alexander Belopolsky40018472011-02-26 01:02:56 +00008578PyObject *
8579PyUnicode_Translate(PyObject *str,
8580 PyObject *mapping,
8581 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582{
8583 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008584
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 str = PyUnicode_FromObject(str);
8586 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008587 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008588 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 Py_DECREF(str);
8590 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591}
Tim Petersced69f82003-09-16 20:30:58 +00008592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008594fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595{
8596 /* No need to call PyUnicode_READY(self) because this function is only
8597 called as a callback from fixup() which does it already. */
8598 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8599 const int kind = PyUnicode_KIND(self);
8600 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008601 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008602 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008603 Py_ssize_t i;
8604
8605 for (i = 0; i < len; ++i) {
8606 ch = PyUnicode_READ(kind, data, i);
8607 fixed = 0;
8608 if (ch > 127) {
8609 if (Py_UNICODE_ISSPACE(ch))
8610 fixed = ' ';
8611 else {
8612 const int decimal = Py_UNICODE_TODECIMAL(ch);
8613 if (decimal >= 0)
8614 fixed = '0' + decimal;
8615 }
8616 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008617 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008618 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619 PyUnicode_WRITE(kind, data, i, fixed);
8620 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008621 else
8622 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008624 }
8625
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008626 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627}
8628
8629PyObject *
8630_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8631{
8632 if (!PyUnicode_Check(unicode)) {
8633 PyErr_BadInternalCall();
8634 return NULL;
8635 }
8636 if (PyUnicode_READY(unicode) == -1)
8637 return NULL;
8638 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8639 /* If the string is already ASCII, just return the same string */
8640 Py_INCREF(unicode);
8641 return unicode;
8642 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008643 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644}
8645
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008646PyObject *
8647PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8648 Py_ssize_t length)
8649{
Victor Stinnerf0124502011-11-21 23:12:56 +01008650 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008651 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008652 Py_UCS4 maxchar;
8653 enum PyUnicode_Kind kind;
8654 void *data;
8655
Victor Stinner99d7ad02012-02-22 13:37:39 +01008656 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008657 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008658 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008659 if (ch > 127) {
8660 int decimal = Py_UNICODE_TODECIMAL(ch);
8661 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008662 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008663 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008664 }
8665 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008666
8667 /* Copy to a new string */
8668 decimal = PyUnicode_New(length, maxchar);
8669 if (decimal == NULL)
8670 return decimal;
8671 kind = PyUnicode_KIND(decimal);
8672 data = PyUnicode_DATA(decimal);
8673 /* Iterate over code points */
8674 for (i = 0; i < length; i++) {
8675 Py_UNICODE ch = s[i];
8676 if (ch > 127) {
8677 int decimal = Py_UNICODE_TODECIMAL(ch);
8678 if (decimal >= 0)
8679 ch = '0' + decimal;
8680 }
8681 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008682 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008683 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008684}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008685/* --- Decimal Encoder ---------------------------------------------------- */
8686
Alexander Belopolsky40018472011-02-26 01:02:56 +00008687int
8688PyUnicode_EncodeDecimal(Py_UNICODE *s,
8689 Py_ssize_t length,
8690 char *output,
8691 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008692{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008693 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008694 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008695 enum PyUnicode_Kind kind;
8696 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008697
8698 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 PyErr_BadArgument();
8700 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008701 }
8702
Victor Stinner42bf7752011-11-21 22:52:58 +01008703 unicode = PyUnicode_FromUnicode(s, length);
8704 if (unicode == NULL)
8705 return -1;
8706
Benjamin Petersonbac79492012-01-14 13:34:47 -05008707 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008708 Py_DECREF(unicode);
8709 return -1;
8710 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008711 kind = PyUnicode_KIND(unicode);
8712 data = PyUnicode_DATA(unicode);
8713
Victor Stinnerb84d7232011-11-22 01:50:07 +01008714 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008715 PyObject *exc;
8716 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008718 Py_ssize_t startpos;
8719
8720 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008721
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008724 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008726 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 decimal = Py_UNICODE_TODECIMAL(ch);
8728 if (decimal >= 0) {
8729 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008730 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 continue;
8732 }
8733 if (0 < ch && ch < 256) {
8734 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008735 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 continue;
8737 }
Victor Stinner6345be92011-11-25 20:09:01 +01008738
Victor Stinner42bf7752011-11-21 22:52:58 +01008739 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008740 exc = NULL;
8741 raise_encode_exception(&exc, "decimal", unicode,
8742 startpos, startpos+1,
8743 "invalid decimal Unicode string");
8744 Py_XDECREF(exc);
8745 Py_DECREF(unicode);
8746 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008747 }
8748 /* 0-terminate the output string */
8749 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008750 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008751 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008752}
8753
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754/* --- Helpers ------------------------------------------------------------ */
8755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008757any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008758 Py_ssize_t start,
8759 Py_ssize_t end)
8760{
8761 int kind1, kind2, kind;
8762 void *buf1, *buf2;
8763 Py_ssize_t len1, len2, result;
8764
8765 kind1 = PyUnicode_KIND(s1);
8766 kind2 = PyUnicode_KIND(s2);
8767 kind = kind1 > kind2 ? kind1 : kind2;
8768 buf1 = PyUnicode_DATA(s1);
8769 buf2 = PyUnicode_DATA(s2);
8770 if (kind1 != kind)
8771 buf1 = _PyUnicode_AsKind(s1, kind);
8772 if (!buf1)
8773 return -2;
8774 if (kind2 != kind)
8775 buf2 = _PyUnicode_AsKind(s2, kind);
8776 if (!buf2) {
8777 if (kind1 != kind) PyMem_Free(buf1);
8778 return -2;
8779 }
8780 len1 = PyUnicode_GET_LENGTH(s1);
8781 len2 = PyUnicode_GET_LENGTH(s2);
8782
Victor Stinner794d5672011-10-10 03:21:36 +02008783 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008784 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008785 case PyUnicode_1BYTE_KIND:
8786 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8787 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8788 else
8789 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8790 break;
8791 case PyUnicode_2BYTE_KIND:
8792 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8793 break;
8794 case PyUnicode_4BYTE_KIND:
8795 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8796 break;
8797 default:
8798 assert(0); result = -2;
8799 }
8800 }
8801 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008802 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008803 case PyUnicode_1BYTE_KIND:
8804 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8805 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8806 else
8807 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8808 break;
8809 case PyUnicode_2BYTE_KIND:
8810 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8811 break;
8812 case PyUnicode_4BYTE_KIND:
8813 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8814 break;
8815 default:
8816 assert(0); result = -2;
8817 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 }
8819
8820 if (kind1 != kind)
8821 PyMem_Free(buf1);
8822 if (kind2 != kind)
8823 PyMem_Free(buf2);
8824
8825 return result;
8826}
8827
8828Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008829_PyUnicode_InsertThousandsGrouping(
8830 PyObject *unicode, Py_ssize_t index,
8831 Py_ssize_t n_buffer,
8832 void *digits, Py_ssize_t n_digits,
8833 Py_ssize_t min_width,
8834 const char *grouping, PyObject *thousands_sep,
8835 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836{
Victor Stinner41a863c2012-02-24 00:37:51 +01008837 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008838 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008839 Py_ssize_t thousands_sep_len;
8840 Py_ssize_t len;
8841
8842 if (unicode != NULL) {
8843 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008844 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008845 }
8846 else {
8847 kind = PyUnicode_1BYTE_KIND;
8848 data = NULL;
8849 }
8850 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8851 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8852 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8853 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008854 if (thousands_sep_kind < kind) {
8855 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8856 if (!thousands_sep_data)
8857 return -1;
8858 }
8859 else {
8860 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8861 if (!data)
8862 return -1;
8863 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008864 }
8865
Benjamin Petersonead6b532011-12-20 17:23:42 -06008866 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008868 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008869 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008870 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008871 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008872 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008873 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008874 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008875 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008876 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008877 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008878 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008880 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008881 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008882 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008883 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008884 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008886 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008887 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008889 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 break;
8891 default:
8892 assert(0);
8893 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008894 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008895 if (unicode != NULL && thousands_sep_kind != kind) {
8896 if (thousands_sep_kind < kind)
8897 PyMem_Free(thousands_sep_data);
8898 else
8899 PyMem_Free(data);
8900 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008901 if (unicode == NULL) {
8902 *maxchar = 127;
8903 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008904 *maxchar = MAX_MAXCHAR(*maxchar,
8905 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008906 }
8907 }
8908 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008909}
8910
8911
/* helper macro to fixup start/end slice values

   `start` and `end` must be modifiable lvalues; `len` is only read (and may
   be evaluated several times, so it must be free of side effects).

     - end > len             -> end = len
     - end < 0               -> end += len, then clamped to 0
     - start < 0             -> start += len, then clamped to 0

   A positive `start` is left untouched, so start may exceed end afterwards.

   The body is wrapped in do { } while (0) so that an invocation followed by
   ';' is exactly one statement and can be the body of an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008926
Alexander Belopolsky40018472011-02-26 01:02:56 +00008927Py_ssize_t
8928PyUnicode_Count(PyObject *str,
8929 PyObject *substr,
8930 Py_ssize_t start,
8931 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008932{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008933 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008934 PyObject* str_obj;
8935 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008936 int kind1, kind2, kind;
8937 void *buf1 = NULL, *buf2 = NULL;
8938 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008939
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008940 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008941 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008943 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008944 if (!sub_obj) {
8945 Py_DECREF(str_obj);
8946 return -1;
8947 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008948 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008949 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 Py_DECREF(str_obj);
8951 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952 }
Tim Petersced69f82003-09-16 20:30:58 +00008953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008954 kind1 = PyUnicode_KIND(str_obj);
8955 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008956 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008959 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008960 if (kind2 > kind) {
8961 Py_DECREF(sub_obj);
8962 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008963 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008964 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008965 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008966 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 if (!buf2)
8968 goto onError;
8969 len1 = PyUnicode_GET_LENGTH(str_obj);
8970 len2 = PyUnicode_GET_LENGTH(sub_obj);
8971
8972 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008973 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008975 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8976 result = asciilib_count(
8977 ((Py_UCS1*)buf1) + start, end - start,
8978 buf2, len2, PY_SSIZE_T_MAX
8979 );
8980 else
8981 result = ucs1lib_count(
8982 ((Py_UCS1*)buf1) + start, end - start,
8983 buf2, len2, PY_SSIZE_T_MAX
8984 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 break;
8986 case PyUnicode_2BYTE_KIND:
8987 result = ucs2lib_count(
8988 ((Py_UCS2*)buf1) + start, end - start,
8989 buf2, len2, PY_SSIZE_T_MAX
8990 );
8991 break;
8992 case PyUnicode_4BYTE_KIND:
8993 result = ucs4lib_count(
8994 ((Py_UCS4*)buf1) + start, end - start,
8995 buf2, len2, PY_SSIZE_T_MAX
8996 );
8997 break;
8998 default:
8999 assert(0); result = 0;
9000 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009001
9002 Py_DECREF(sub_obj);
9003 Py_DECREF(str_obj);
9004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 if (kind2 != kind)
9006 PyMem_Free(buf2);
9007
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 onError:
9010 Py_DECREF(sub_obj);
9011 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 if (kind2 != kind && buf2)
9013 PyMem_Free(buf2);
9014 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015}
9016
Alexander Belopolsky40018472011-02-26 01:02:56 +00009017Py_ssize_t
9018PyUnicode_Find(PyObject *str,
9019 PyObject *sub,
9020 Py_ssize_t start,
9021 Py_ssize_t end,
9022 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009024 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009025
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009027 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009028 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009029 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009030 if (!sub) {
9031 Py_DECREF(str);
9032 return -2;
9033 }
9034 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9035 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 Py_DECREF(str);
9037 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038 }
Tim Petersced69f82003-09-16 20:30:58 +00009039
Victor Stinner794d5672011-10-10 03:21:36 +02009040 result = any_find_slice(direction,
9041 str, sub, start, end
9042 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009043
Guido van Rossumd57fd912000-03-10 22:53:23 +00009044 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009045 Py_DECREF(sub);
9046
Guido van Rossumd57fd912000-03-10 22:53:23 +00009047 return result;
9048}
9049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050Py_ssize_t
9051PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9052 Py_ssize_t start, Py_ssize_t end,
9053 int direction)
9054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009056 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (PyUnicode_READY(str) == -1)
9058 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009059 if (start < 0 || end < 0) {
9060 PyErr_SetString(PyExc_IndexError, "string index out of range");
9061 return -2;
9062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 if (end > PyUnicode_GET_LENGTH(str))
9064 end = PyUnicode_GET_LENGTH(str);
9065 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009066 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9067 kind, end-start, ch, direction);
9068 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009070 else
9071 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072}
9073
Alexander Belopolsky40018472011-02-26 01:02:56 +00009074static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009075tailmatch(PyObject *self,
9076 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009077 Py_ssize_t start,
9078 Py_ssize_t end,
9079 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 int kind_self;
9082 int kind_sub;
9083 void *data_self;
9084 void *data_sub;
9085 Py_ssize_t offset;
9086 Py_ssize_t i;
9087 Py_ssize_t end_sub;
9088
9089 if (PyUnicode_READY(self) == -1 ||
9090 PyUnicode_READY(substring) == -1)
9091 return 0;
9092
9093 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094 return 1;
9095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9097 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101 kind_self = PyUnicode_KIND(self);
9102 data_self = PyUnicode_DATA(self);
9103 kind_sub = PyUnicode_KIND(substring);
9104 data_sub = PyUnicode_DATA(substring);
9105 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9106
9107 if (direction > 0)
9108 offset = end;
9109 else
9110 offset = start;
9111
9112 if (PyUnicode_READ(kind_self, data_self, offset) ==
9113 PyUnicode_READ(kind_sub, data_sub, 0) &&
9114 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9115 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9116 /* If both are of the same kind, memcmp is sufficient */
9117 if (kind_self == kind_sub) {
9118 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009119 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 data_sub,
9121 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009122 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 }
9124 /* otherwise we have to compare each character by first accesing it */
9125 else {
9126 /* We do not need to compare 0 and len(substring)-1 because
9127 the if statement above ensured already that they are equal
9128 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009129 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 for (i = 1; i < end_sub; ++i) {
9131 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9132 PyUnicode_READ(kind_sub, data_sub, i))
9133 return 0;
9134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 }
9138
9139 return 0;
9140}
9141
Alexander Belopolsky40018472011-02-26 01:02:56 +00009142Py_ssize_t
9143PyUnicode_Tailmatch(PyObject *str,
9144 PyObject *substr,
9145 Py_ssize_t start,
9146 Py_ssize_t end,
9147 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009148{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009149 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009150
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 str = PyUnicode_FromObject(str);
9152 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009153 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 substr = PyUnicode_FromObject(substr);
9155 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009156 Py_DECREF(str);
9157 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 }
Tim Petersced69f82003-09-16 20:30:58 +00009159
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009160 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009161 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 Py_DECREF(str);
9163 Py_DECREF(substr);
9164 return result;
9165}
9166
Guido van Rossumd57fd912000-03-10 22:53:23 +00009167/* Apply fixfct filter to the Unicode object self and return a
9168 reference to the modified object */
9169
Alexander Belopolsky40018472011-02-26 01:02:56 +00009170static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009171fixup(PyObject *self,
9172 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 PyObject *u;
9175 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009176 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009178 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009179 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009180 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009181 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 /* fix functions return the new maximum character in a string,
9184 if the kind of the resulting unicode object does not change,
9185 everything is fine. Otherwise we need to change the string kind
9186 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009187 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009188
9189 if (maxchar_new == 0) {
9190 /* no changes */;
9191 if (PyUnicode_CheckExact(self)) {
9192 Py_DECREF(u);
9193 Py_INCREF(self);
9194 return self;
9195 }
9196 else
9197 return u;
9198 }
9199
Victor Stinnere6abb482012-05-02 01:15:40 +02009200 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009201
Victor Stinnereaab6042011-12-11 22:22:39 +01009202 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009204
9205 /* In case the maximum character changed, we need to
9206 convert the string to the new category. */
9207 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9208 if (v == NULL) {
9209 Py_DECREF(u);
9210 return NULL;
9211 }
9212 if (maxchar_new > maxchar_old) {
9213 /* If the maxchar increased so that the kind changed, not all
9214 characters are representable anymore and we need to fix the
9215 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009216 _PyUnicode_FastCopyCharacters(v, 0,
9217 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009218 maxchar_old = fixfct(v);
9219 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 }
9221 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009222 _PyUnicode_FastCopyCharacters(v, 0,
9223 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009225 Py_DECREF(u);
9226 assert(_PyUnicode_CheckConsistency(v, 1));
9227 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228}
9229
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009230static PyObject *
9231ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009233 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9234 char *resdata, *data = PyUnicode_DATA(self);
9235 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009236
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009237 res = PyUnicode_New(len, 127);
9238 if (res == NULL)
9239 return NULL;
9240 resdata = PyUnicode_DATA(res);
9241 if (lower)
9242 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009243 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009244 _Py_bytes_upper(resdata, data, len);
9245 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009246}
9247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009249handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009251 Py_ssize_t j;
9252 int final_sigma;
9253 Py_UCS4 c;
9254 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009255
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009256 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9257
9258 where ! is a negation and \p{xxx} is a character with property xxx.
9259 */
9260 for (j = i - 1; j >= 0; j--) {
9261 c = PyUnicode_READ(kind, data, j);
9262 if (!_PyUnicode_IsCaseIgnorable(c))
9263 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009265 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9266 if (final_sigma) {
9267 for (j = i + 1; j < length; j++) {
9268 c = PyUnicode_READ(kind, data, j);
9269 if (!_PyUnicode_IsCaseIgnorable(c))
9270 break;
9271 }
9272 final_sigma = j == length || !_PyUnicode_IsCased(c);
9273 }
9274 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275}
9276
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009277static int
9278lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9279 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009281 /* Obscure special case. */
9282 if (c == 0x3A3) {
9283 mapped[0] = handle_capital_sigma(kind, data, length, i);
9284 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009286 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287}
9288
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009289static Py_ssize_t
9290do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009292 Py_ssize_t i, k = 0;
9293 int n_res, j;
9294 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009295
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 c = PyUnicode_READ(kind, data, 0);
9297 n_res = _PyUnicode_ToUpperFull(c, mapped);
9298 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009299 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009300 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009301 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009302 for (i = 1; i < length; i++) {
9303 c = PyUnicode_READ(kind, data, i);
9304 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9305 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009306 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009307 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009308 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009309 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009310 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311}
9312
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009313static Py_ssize_t
9314do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9315 Py_ssize_t i, k = 0;
9316
9317 for (i = 0; i < length; i++) {
9318 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9319 int n_res, j;
9320 if (Py_UNICODE_ISUPPER(c)) {
9321 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9322 }
9323 else if (Py_UNICODE_ISLOWER(c)) {
9324 n_res = _PyUnicode_ToUpperFull(c, mapped);
9325 }
9326 else {
9327 n_res = 1;
9328 mapped[0] = c;
9329 }
9330 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009331 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009332 res[k++] = mapped[j];
9333 }
9334 }
9335 return k;
9336}
9337
9338static Py_ssize_t
9339do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9340 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009342 Py_ssize_t i, k = 0;
9343
9344 for (i = 0; i < length; i++) {
9345 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9346 int n_res, j;
9347 if (lower)
9348 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9349 else
9350 n_res = _PyUnicode_ToUpperFull(c, mapped);
9351 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009352 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009353 res[k++] = mapped[j];
9354 }
9355 }
9356 return k;
9357}
9358
9359static Py_ssize_t
9360do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9361{
9362 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9363}
9364
9365static Py_ssize_t
9366do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9367{
9368 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9369}
9370
Benjamin Petersone51757f2012-01-12 21:10:29 -05009371static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009372do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9373{
9374 Py_ssize_t i, k = 0;
9375
9376 for (i = 0; i < length; i++) {
9377 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9378 Py_UCS4 mapped[3];
9379 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9380 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009381 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009382 res[k++] = mapped[j];
9383 }
9384 }
9385 return k;
9386}
9387
9388static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009389do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9390{
9391 Py_ssize_t i, k = 0;
9392 int previous_is_cased;
9393
9394 previous_is_cased = 0;
9395 for (i = 0; i < length; i++) {
9396 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9397 Py_UCS4 mapped[3];
9398 int n_res, j;
9399
9400 if (previous_is_cased)
9401 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9402 else
9403 n_res = _PyUnicode_ToTitleFull(c, mapped);
9404
9405 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009406 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009407 res[k++] = mapped[j];
9408 }
9409
9410 previous_is_cased = _PyUnicode_IsCased(c);
9411 }
9412 return k;
9413}
9414
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009415static PyObject *
9416case_operation(PyObject *self,
9417 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9418{
9419 PyObject *res = NULL;
9420 Py_ssize_t length, newlength = 0;
9421 int kind, outkind;
9422 void *data, *outdata;
9423 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9424
Benjamin Petersoneea48462012-01-16 14:28:50 -05009425 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009426
9427 kind = PyUnicode_KIND(self);
9428 data = PyUnicode_DATA(self);
9429 length = PyUnicode_GET_LENGTH(self);
9430 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9431 if (tmp == NULL)
9432 return PyErr_NoMemory();
9433 newlength = perform(kind, data, length, tmp, &maxchar);
9434 res = PyUnicode_New(newlength, maxchar);
9435 if (res == NULL)
9436 goto leave;
9437 tmpend = tmp + newlength;
9438 outdata = PyUnicode_DATA(res);
9439 outkind = PyUnicode_KIND(res);
9440 switch (outkind) {
9441 case PyUnicode_1BYTE_KIND:
9442 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9443 break;
9444 case PyUnicode_2BYTE_KIND:
9445 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9446 break;
9447 case PyUnicode_4BYTE_KIND:
9448 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9449 break;
9450 default:
9451 assert(0);
9452 break;
9453 }
9454 leave:
9455 PyMem_FREE(tmp);
9456 return res;
9457}
9458
Tim Peters8ce9f162004-08-27 01:49:32 +00009459PyObject *
9460PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009461{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009463 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009465 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009466 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9467 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009468 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009470 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009472 int use_memcpy;
9473 unsigned char *res_data = NULL, *sep_data = NULL;
9474 PyObject *last_obj;
9475 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476
Tim Peters05eba1f2004-08-27 21:32:02 +00009477 fseq = PySequence_Fast(seq, "");
9478 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009479 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009480 }
9481
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009482 /* NOTE: the following code can't call back into Python code,
9483 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009484 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009485
Tim Peters05eba1f2004-08-27 21:32:02 +00009486 seqlen = PySequence_Fast_GET_SIZE(fseq);
9487 /* If empty sequence, return u"". */
9488 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009489 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009490 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009491 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009492
Tim Peters05eba1f2004-08-27 21:32:02 +00009493 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009494 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009495 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009496 if (seqlen == 1) {
9497 if (PyUnicode_CheckExact(items[0])) {
9498 res = items[0];
9499 Py_INCREF(res);
9500 Py_DECREF(fseq);
9501 return res;
9502 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009503 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009504 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009505 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009506 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009507 /* Set up sep and seplen */
9508 if (separator == NULL) {
9509 /* fall back to a blank space separator */
9510 sep = PyUnicode_FromOrdinal(' ');
9511 if (!sep)
9512 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009513 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009514 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009515 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009516 else {
9517 if (!PyUnicode_Check(separator)) {
9518 PyErr_Format(PyExc_TypeError,
9519 "separator: expected str instance,"
9520 " %.80s found",
9521 Py_TYPE(separator)->tp_name);
9522 goto onError;
9523 }
9524 if (PyUnicode_READY(separator))
9525 goto onError;
9526 sep = separator;
9527 seplen = PyUnicode_GET_LENGTH(separator);
9528 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9529 /* inc refcount to keep this code path symmetric with the
9530 above case of a blank separator */
9531 Py_INCREF(sep);
9532 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009533 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009534 }
9535
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009536 /* There are at least two things to join, or else we have a subclass
9537 * of str in the sequence.
9538 * Do a pre-pass to figure out the total amount of space we'll
9539 * need (sz), and see whether all argument are strings.
9540 */
9541 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009542#ifdef Py_DEBUG
9543 use_memcpy = 0;
9544#else
9545 use_memcpy = 1;
9546#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009547 for (i = 0; i < seqlen; i++) {
9548 const Py_ssize_t old_sz = sz;
9549 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009550 if (!PyUnicode_Check(item)) {
9551 PyErr_Format(PyExc_TypeError,
9552 "sequence item %zd: expected str instance,"
9553 " %.80s found",
9554 i, Py_TYPE(item)->tp_name);
9555 goto onError;
9556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 if (PyUnicode_READY(item) == -1)
9558 goto onError;
9559 sz += PyUnicode_GET_LENGTH(item);
9560 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009561 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009562 if (i != 0)
9563 sz += seplen;
9564 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9565 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009566 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009567 goto onError;
9568 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009569 if (use_memcpy && last_obj != NULL) {
9570 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9571 use_memcpy = 0;
9572 }
9573 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009574 }
Tim Petersced69f82003-09-16 20:30:58 +00009575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009576 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009577 if (res == NULL)
9578 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009579
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009580 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009581#ifdef Py_DEBUG
9582 use_memcpy = 0;
9583#else
9584 if (use_memcpy) {
9585 res_data = PyUnicode_1BYTE_DATA(res);
9586 kind = PyUnicode_KIND(res);
9587 if (seplen != 0)
9588 sep_data = PyUnicode_1BYTE_DATA(sep);
9589 }
9590#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009592 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009593 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009595 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009596 if (use_memcpy) {
9597 Py_MEMCPY(res_data,
9598 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009599 kind * seplen);
9600 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009601 }
9602 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009603 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009604 res_offset += seplen;
9605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009606 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009607 itemlen = PyUnicode_GET_LENGTH(item);
9608 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009609 if (use_memcpy) {
9610 Py_MEMCPY(res_data,
9611 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009612 kind * itemlen);
9613 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009614 }
9615 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009616 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009617 res_offset += itemlen;
9618 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009619 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009620 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 if (use_memcpy)
9622 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009623 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009624 else
9625 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009626
Tim Peters05eba1f2004-08-27 21:32:02 +00009627 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009629 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009633 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009635 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636 return NULL;
9637}
9638
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   character width and must not be PyUnicode_WCHAR_KIND (asserted).

   Every argument is parenthesized in the expansion so that expression
   arguments are safe to pass.  `kind`, `value` and `length` may still be
   evaluated more than once, so arguments must not have side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9662
Victor Stinnerd3f08822012-05-29 12:57:52 +02009663void
9664_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9665 Py_UCS4 fill_char)
9666{
9667 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9668 const void *data = PyUnicode_DATA(unicode);
9669 assert(PyUnicode_IS_READY(unicode));
9670 assert(unicode_modifiable(unicode));
9671 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9672 assert(start >= 0);
9673 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9674 FILL(kind, data, fill_char, start, length);
9675}
9676
Victor Stinner3fe55312012-01-04 00:33:50 +01009677Py_ssize_t
9678PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9679 Py_UCS4 fill_char)
9680{
9681 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009682
9683 if (!PyUnicode_Check(unicode)) {
9684 PyErr_BadInternalCall();
9685 return -1;
9686 }
9687 if (PyUnicode_READY(unicode) == -1)
9688 return -1;
9689 if (unicode_check_modifiable(unicode))
9690 return -1;
9691
Victor Stinnerd3f08822012-05-29 12:57:52 +02009692 if (start < 0) {
9693 PyErr_SetString(PyExc_IndexError, "string index out of range");
9694 return -1;
9695 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009696 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9697 PyErr_SetString(PyExc_ValueError,
9698 "fill character is bigger than "
9699 "the string maximum character");
9700 return -1;
9701 }
9702
9703 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9704 length = Py_MIN(maxlen, length);
9705 if (length <= 0)
9706 return 0;
9707
Victor Stinnerd3f08822012-05-29 12:57:52 +02009708 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009709 return length;
9710}
9711
Victor Stinner9310abb2011-10-05 00:59:23 +02009712static PyObject *
9713pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009714 Py_ssize_t left,
9715 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 PyObject *u;
9719 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009720 int kind;
9721 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009722
9723 if (left < 0)
9724 left = 0;
9725 if (right < 0)
9726 right = 0;
9727
Victor Stinnerc4b49542011-12-11 22:44:26 +01009728 if (left == 0 && right == 0)
9729 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9732 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009733 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9734 return NULL;
9735 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009737 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009739 if (!u)
9740 return NULL;
9741
9742 kind = PyUnicode_KIND(u);
9743 data = PyUnicode_DATA(u);
9744 if (left)
9745 FILL(kind, data, fill, 0, left);
9746 if (right)
9747 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009748 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009749 assert(_PyUnicode_CheckConsistency(u, 1));
9750 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751}
9752
Alexander Belopolsky40018472011-02-26 01:02:56 +00009753PyObject *
9754PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757
9758 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009759 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009760 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009761 if (PyUnicode_READY(string) == -1) {
9762 Py_DECREF(string);
9763 return NULL;
9764 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765
Benjamin Petersonead6b532011-12-20 17:23:42 -06009766 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009768 if (PyUnicode_IS_ASCII(string))
9769 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009770 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009771 PyUnicode_GET_LENGTH(string), keepends);
9772 else
9773 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009775 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 break;
9777 case PyUnicode_2BYTE_KIND:
9778 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009779 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 PyUnicode_GET_LENGTH(string), keepends);
9781 break;
9782 case PyUnicode_4BYTE_KIND:
9783 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009784 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 PyUnicode_GET_LENGTH(string), keepends);
9786 break;
9787 default:
9788 assert(0);
9789 list = 0;
9790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009791 Py_DECREF(string);
9792 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009793}
9794
Alexander Belopolsky40018472011-02-26 01:02:56 +00009795static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009796split(PyObject *self,
9797 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009798 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009800 int kind1, kind2, kind;
9801 void *buf1, *buf2;
9802 Py_ssize_t len1, len2;
9803 PyObject* out;
9804
Guido van Rossumd57fd912000-03-10 22:53:23 +00009805 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009806 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009808 if (PyUnicode_READY(self) == -1)
9809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009811 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009812 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009813 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009814 if (PyUnicode_IS_ASCII(self))
9815 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009816 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009817 PyUnicode_GET_LENGTH(self), maxcount
9818 );
9819 else
9820 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009821 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009822 PyUnicode_GET_LENGTH(self), maxcount
9823 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 case PyUnicode_2BYTE_KIND:
9825 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009826 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009827 PyUnicode_GET_LENGTH(self), maxcount
9828 );
9829 case PyUnicode_4BYTE_KIND:
9830 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009831 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 PyUnicode_GET_LENGTH(self), maxcount
9833 );
9834 default:
9835 assert(0);
9836 return NULL;
9837 }
9838
9839 if (PyUnicode_READY(substring) == -1)
9840 return NULL;
9841
9842 kind1 = PyUnicode_KIND(self);
9843 kind2 = PyUnicode_KIND(substring);
9844 kind = kind1 > kind2 ? kind1 : kind2;
9845 buf1 = PyUnicode_DATA(self);
9846 buf2 = PyUnicode_DATA(substring);
9847 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009848 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 if (!buf1)
9850 return NULL;
9851 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009852 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (!buf2) {
9854 if (kind1 != kind) PyMem_Free(buf1);
9855 return NULL;
9856 }
9857 len1 = PyUnicode_GET_LENGTH(self);
9858 len2 = PyUnicode_GET_LENGTH(substring);
9859
Benjamin Petersonead6b532011-12-20 17:23:42 -06009860 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009862 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9863 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009864 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009865 else
9866 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009867 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009868 break;
9869 case PyUnicode_2BYTE_KIND:
9870 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 break;
9873 case PyUnicode_4BYTE_KIND:
9874 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 default:
9878 out = NULL;
9879 }
9880 if (kind1 != kind)
9881 PyMem_Free(buf1);
9882 if (kind2 != kind)
9883 PyMem_Free(buf2);
9884 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009885}
9886
Alexander Belopolsky40018472011-02-26 01:02:56 +00009887static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009888rsplit(PyObject *self,
9889 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009890 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009892 int kind1, kind2, kind;
9893 void *buf1, *buf2;
9894 Py_ssize_t len1, len2;
9895 PyObject* out;
9896
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009897 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009898 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 if (PyUnicode_READY(self) == -1)
9901 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009904 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009906 if (PyUnicode_IS_ASCII(self))
9907 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009908 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009909 PyUnicode_GET_LENGTH(self), maxcount
9910 );
9911 else
9912 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009913 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009914 PyUnicode_GET_LENGTH(self), maxcount
9915 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 case PyUnicode_2BYTE_KIND:
9917 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009918 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 PyUnicode_GET_LENGTH(self), maxcount
9920 );
9921 case PyUnicode_4BYTE_KIND:
9922 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009923 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 PyUnicode_GET_LENGTH(self), maxcount
9925 );
9926 default:
9927 assert(0);
9928 return NULL;
9929 }
9930
9931 if (PyUnicode_READY(substring) == -1)
9932 return NULL;
9933
9934 kind1 = PyUnicode_KIND(self);
9935 kind2 = PyUnicode_KIND(substring);
9936 kind = kind1 > kind2 ? kind1 : kind2;
9937 buf1 = PyUnicode_DATA(self);
9938 buf2 = PyUnicode_DATA(substring);
9939 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009940 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 if (!buf1)
9942 return NULL;
9943 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009944 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (!buf2) {
9946 if (kind1 != kind) PyMem_Free(buf1);
9947 return NULL;
9948 }
9949 len1 = PyUnicode_GET_LENGTH(self);
9950 len2 = PyUnicode_GET_LENGTH(substring);
9951
Benjamin Petersonead6b532011-12-20 17:23:42 -06009952 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009954 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9955 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009956 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009957 else
9958 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009959 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 break;
9961 case PyUnicode_2BYTE_KIND:
9962 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 case PyUnicode_4BYTE_KIND:
9966 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 default:
9970 out = NULL;
9971 }
9972 if (kind1 != kind)
9973 PyMem_Free(buf1);
9974 if (kind2 != kind)
9975 PyMem_Free(buf2);
9976 return out;
9977}
9978
9979static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009980anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9981 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009983 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009984 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009985 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9986 return asciilib_find(buf1, len1, buf2, len2, offset);
9987 else
9988 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009989 case PyUnicode_2BYTE_KIND:
9990 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9991 case PyUnicode_4BYTE_KIND:
9992 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9993 }
9994 assert(0);
9995 return -1;
9996}
9997
9998static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10000 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010002 switch (kind) {
10003 case PyUnicode_1BYTE_KIND:
10004 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10005 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10006 else
10007 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10008 case PyUnicode_2BYTE_KIND:
10009 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10010 case PyUnicode_4BYTE_KIND:
10011 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10012 }
10013 assert(0);
10014 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010015}
10016
Alexander Belopolsky40018472011-02-26 01:02:56 +000010017static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018replace(PyObject *self, PyObject *str1,
10019 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 PyObject *u;
10022 char *sbuf = PyUnicode_DATA(self);
10023 char *buf1 = PyUnicode_DATA(str1);
10024 char *buf2 = PyUnicode_DATA(str2);
10025 int srelease = 0, release1 = 0, release2 = 0;
10026 int skind = PyUnicode_KIND(self);
10027 int kind1 = PyUnicode_KIND(str1);
10028 int kind2 = PyUnicode_KIND(str2);
10029 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10030 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10031 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010032 int mayshrink;
10033 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034
10035 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010036 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010038 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039
Victor Stinner59de0ee2011-10-07 10:01:28 +020010040 if (str1 == str2)
10041 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 if (skind < kind1)
10043 /* substring too wide to be present */
10044 goto nothing;
10045
Victor Stinner49a0a212011-10-12 23:46:10 +020010046 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10047 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10048 /* Replacing str1 with str2 may cause a maxchar reduction in the
10049 result string. */
10050 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010051 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010054 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010056 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010059 Py_UCS4 u1, u2;
10060 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010061 Py_ssize_t index, pos;
10062 char *src;
10063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010065 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10066 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010070 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010071 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010072 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010074
10075 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10076 index = 0;
10077 src = sbuf;
10078 while (--maxcount)
10079 {
10080 pos++;
10081 src += pos * PyUnicode_KIND(self);
10082 slen -= pos;
10083 index += pos;
10084 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10085 if (pos < 0)
10086 break;
10087 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10088 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010089 }
10090 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010091 int rkind = skind;
10092 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010093 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010094
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 if (kind1 < rkind) {
10096 /* widen substring */
10097 buf1 = _PyUnicode_AsKind(str1, rkind);
10098 if (!buf1) goto error;
10099 release1 = 1;
10100 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010101 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010102 if (i < 0)
10103 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 if (rkind > kind2) {
10105 /* widen replacement */
10106 buf2 = _PyUnicode_AsKind(str2, rkind);
10107 if (!buf2) goto error;
10108 release2 = 1;
10109 }
10110 else if (rkind < kind2) {
10111 /* widen self and buf1 */
10112 rkind = kind2;
10113 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010114 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 sbuf = _PyUnicode_AsKind(self, rkind);
10116 if (!sbuf) goto error;
10117 srelease = 1;
10118 buf1 = _PyUnicode_AsKind(str1, rkind);
10119 if (!buf1) goto error;
10120 release1 = 1;
10121 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010122 u = PyUnicode_New(slen, maxchar);
10123 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010125 assert(PyUnicode_KIND(u) == rkind);
10126 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010127
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010128 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010129 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010130 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010134
10135 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010136 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010137 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010138 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010139 if (i == -1)
10140 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010143 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010147 }
10148 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010149 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010150 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 int rkind = skind;
10152 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010155 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 buf1 = _PyUnicode_AsKind(str1, rkind);
10157 if (!buf1) goto error;
10158 release1 = 1;
10159 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010160 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010161 if (n == 0)
10162 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010164 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 buf2 = _PyUnicode_AsKind(str2, rkind);
10166 if (!buf2) goto error;
10167 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010168 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010170 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 rkind = kind2;
10172 sbuf = _PyUnicode_AsKind(self, rkind);
10173 if (!sbuf) goto error;
10174 srelease = 1;
10175 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010176 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 buf1 = _PyUnicode_AsKind(str1, rkind);
10178 if (!buf1) goto error;
10179 release1 = 1;
10180 }
10181 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10182 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010183 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 PyErr_SetString(PyExc_OverflowError,
10185 "replace string is too long");
10186 goto error;
10187 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010188 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010189 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010190 _Py_INCREF_UNICODE_EMPTY();
10191 if (!unicode_empty)
10192 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 u = unicode_empty;
10194 goto done;
10195 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010196 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 PyErr_SetString(PyExc_OverflowError,
10198 "replace string is too long");
10199 goto error;
10200 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010201 u = PyUnicode_New(new_size, maxchar);
10202 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010203 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010204 assert(PyUnicode_KIND(u) == rkind);
10205 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 ires = i = 0;
10207 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010208 while (n-- > 0) {
10209 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010211 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010212 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010213 if (j == -1)
10214 break;
10215 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010216 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 memcpy(res + rkind * ires,
10218 sbuf + rkind * i,
10219 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010221 }
10222 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010226 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010228 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010233 memcpy(res + rkind * ires,
10234 sbuf + rkind * i,
10235 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010236 }
10237 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010238 /* interleave */
10239 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010240 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010242 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244 if (--n <= 0)
10245 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 memcpy(res + rkind * ires,
10247 sbuf + rkind * i,
10248 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 ires++;
10250 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010251 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010252 memcpy(res + rkind * ires,
10253 sbuf + rkind * i,
10254 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010256 }
10257
10258 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010259 unicode_adjust_maxchar(&u);
10260 if (u == NULL)
10261 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010263
10264 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (srelease)
10266 PyMem_FREE(sbuf);
10267 if (release1)
10268 PyMem_FREE(buf1);
10269 if (release2)
10270 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010271 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010273
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010275 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 if (srelease)
10277 PyMem_FREE(sbuf);
10278 if (release1)
10279 PyMem_FREE(buf1);
10280 if (release2)
10281 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010282 return unicode_result_unchanged(self);
10283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 error:
10285 if (srelease && sbuf)
10286 PyMem_FREE(sbuf);
10287 if (release1 && buf1)
10288 PyMem_FREE(buf1);
10289 if (release2 && buf2)
10290 PyMem_FREE(buf2);
10291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292}
10293
10294/* --- Unicode Object Methods --------------------------------------------- */
10295
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010296PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010297 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298\n\
10299Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010300characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010301
10302static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010303unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010304{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010305 if (PyUnicode_READY(self) == -1)
10306 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010307 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308}
10309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010310PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010311 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312\n\
10313Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010314have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315
10316static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010317unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010319 if (PyUnicode_READY(self) == -1)
10320 return NULL;
10321 if (PyUnicode_GET_LENGTH(self) == 0)
10322 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010323 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324}
10325
Benjamin Petersond5890c82012-01-14 13:23:30 -050010326PyDoc_STRVAR(casefold__doc__,
10327 "S.casefold() -> str\n\
10328\n\
10329Return a version of S suitable for caseless comparisons.");
10330
10331static PyObject *
10332unicode_casefold(PyObject *self)
10333{
10334 if (PyUnicode_READY(self) == -1)
10335 return NULL;
10336 if (PyUnicode_IS_ASCII(self))
10337 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010338 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010339}
10340
10341
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010342/* Argument converter. Coerces to a single unicode character */
10343
10344static int
10345convert_uc(PyObject *obj, void *addr)
10346{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010348 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010349
Benjamin Peterson14339b62009-01-31 16:36:08 +000010350 uniobj = PyUnicode_FromObject(obj);
10351 if (uniobj == NULL) {
10352 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010353 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 return 0;
10355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010357 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010359 Py_DECREF(uniobj);
10360 return 0;
10361 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 Py_DECREF(uniobj);
10364 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010365}
10366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010367PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010368 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010369\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010370Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010371done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010372
10373static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010374unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010375{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010376 Py_ssize_t marg, left;
10377 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 Py_UCS4 fillchar = ' ';
10379
Victor Stinnere9a29352011-10-01 02:14:59 +020010380 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010382
Benjamin Petersonbac79492012-01-14 13:34:47 -050010383 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384 return NULL;
10385
Victor Stinnerc4b49542011-12-11 22:44:26 +010010386 if (PyUnicode_GET_LENGTH(self) >= width)
10387 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388
Victor Stinnerc4b49542011-12-11 22:44:26 +010010389 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390 left = marg / 2 + (marg & width & 1);
10391
Victor Stinner9310abb2011-10-05 00:59:23 +020010392 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393}
10394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395/* This function assumes that str1 and str2 are readied by the caller. */
10396
Marc-André Lemburge5034372000-08-08 08:04:29 +000010397static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010398unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010399{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 int kind1, kind2;
10401 void *data1, *data2;
10402 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 kind1 = PyUnicode_KIND(str1);
10405 kind2 = PyUnicode_KIND(str2);
10406 data1 = PyUnicode_DATA(str1);
10407 data2 = PyUnicode_DATA(str2);
10408 len1 = PyUnicode_GET_LENGTH(str1);
10409 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 for (i = 0; i < len1 && i < len2; ++i) {
10412 Py_UCS4 c1, c2;
10413 c1 = PyUnicode_READ(kind1, data1, i);
10414 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010415
10416 if (c1 != c2)
10417 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010418 }
10419
10420 return (len1 < len2) ? -1 : (len1 != len2);
10421}
10422
Alexander Belopolsky40018472011-02-26 01:02:56 +000010423int
10424PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10427 if (PyUnicode_READY(left) == -1 ||
10428 PyUnicode_READY(right) == -1)
10429 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010430 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010432 PyErr_Format(PyExc_TypeError,
10433 "Can't compare %.100s and %.100s",
10434 left->ob_type->tp_name,
10435 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 return -1;
10437}
10438
Martin v. Löwis5b222132007-06-10 09:51:05 +000010439int
10440PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10441{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 Py_ssize_t i;
10443 int kind;
10444 void *data;
10445 Py_UCS4 chr;
10446
Victor Stinner910337b2011-10-03 03:20:16 +020010447 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (PyUnicode_READY(uni) == -1)
10449 return -1;
10450 kind = PyUnicode_KIND(uni);
10451 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010452 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010453 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10454 if (chr != str[i])
10455 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010456 /* This check keeps Python strings that end in '\0' from comparing equal
10457 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010460 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010462 return 0;
10463}
10464
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010465
/* Map a C truth value to Py_True or Py_False.  The macro itself does not
   touch the reference count of the selected object. */
#define TEST_COND(cond) ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010468
Alexander Belopolsky40018472011-02-26 01:02:56 +000010469PyObject *
10470PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010471{
10472 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010473
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010474 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10475 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 if (PyUnicode_READY(left) == -1 ||
10477 PyUnicode_READY(right) == -1)
10478 return NULL;
10479 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10480 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010481 if (op == Py_EQ) {
10482 Py_INCREF(Py_False);
10483 return Py_False;
10484 }
10485 if (op == Py_NE) {
10486 Py_INCREF(Py_True);
10487 return Py_True;
10488 }
10489 }
10490 if (left == right)
10491 result = 0;
10492 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010493 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010494
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010495 /* Convert the return value to a Boolean */
10496 switch (op) {
10497 case Py_EQ:
10498 v = TEST_COND(result == 0);
10499 break;
10500 case Py_NE:
10501 v = TEST_COND(result != 0);
10502 break;
10503 case Py_LE:
10504 v = TEST_COND(result <= 0);
10505 break;
10506 case Py_GE:
10507 v = TEST_COND(result >= 0);
10508 break;
10509 case Py_LT:
10510 v = TEST_COND(result == -1);
10511 break;
10512 case Py_GT:
10513 v = TEST_COND(result == 1);
10514 break;
10515 default:
10516 PyErr_BadArgument();
10517 return NULL;
10518 }
10519 Py_INCREF(v);
10520 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010521 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010522
Brian Curtindfc80e32011-08-10 20:28:54 -050010523 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010524}
10525
Alexander Belopolsky40018472011-02-26 01:02:56 +000010526int
10527PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010528{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010529 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 int kind1, kind2, kind;
10531 void *buf1, *buf2;
10532 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010533 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010534
10535 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010536 sub = PyUnicode_FromObject(element);
10537 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 PyErr_Format(PyExc_TypeError,
10539 "'in <string>' requires string as left operand, not %s",
10540 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010542 }
10543
Thomas Wouters477c8d52006-05-27 19:21:47 +000010544 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010545 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010546 Py_DECREF(sub);
10547 return -1;
10548 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010549 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10550 Py_DECREF(sub);
10551 Py_DECREF(str);
10552 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 kind1 = PyUnicode_KIND(str);
10555 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010556 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010557 buf1 = PyUnicode_DATA(str);
10558 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010559 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010560 if (kind2 > kind) {
10561 Py_DECREF(sub);
10562 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010563 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010564 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010565 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 if (!buf2) {
10568 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010569 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010570 return -1;
10571 }
10572 len1 = PyUnicode_GET_LENGTH(str);
10573 len2 = PyUnicode_GET_LENGTH(sub);
10574
Benjamin Petersonead6b532011-12-20 17:23:42 -060010575 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 case PyUnicode_1BYTE_KIND:
10577 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10578 break;
10579 case PyUnicode_2BYTE_KIND:
10580 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10581 break;
10582 case PyUnicode_4BYTE_KIND:
10583 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10584 break;
10585 default:
10586 result = -1;
10587 assert(0);
10588 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010589
10590 Py_DECREF(str);
10591 Py_DECREF(sub);
10592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (kind2 != kind)
10594 PyMem_Free(buf2);
10595
Guido van Rossum403d68b2000-03-13 15:55:09 +000010596 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010597}
10598
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599/* Concat to string or Unicode object giving a new Unicode object. */
10600
Alexander Belopolsky40018472011-02-26 01:02:56 +000010601PyObject *
10602PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010604 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010605 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010606 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
10608 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010611 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
10616 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010617 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010621 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
10625
Victor Stinner488fa492011-12-12 00:01:39 +010010626 u_len = PyUnicode_GET_LENGTH(u);
10627 v_len = PyUnicode_GET_LENGTH(v);
10628 if (u_len > PY_SSIZE_T_MAX - v_len) {
10629 PyErr_SetString(PyExc_OverflowError,
10630 "strings are too large to concat");
10631 goto onError;
10632 }
10633 new_len = u_len + v_len;
10634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010636 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010637 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638
Guido van Rossumd57fd912000-03-10 22:53:23 +000010639 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010640 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010642 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010643 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10644 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 Py_DECREF(u);
10646 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010647 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010651 Py_XDECREF(u);
10652 Py_XDECREF(v);
10653 return NULL;
10654}
10655
Walter Dörwald1ab83302007-05-18 17:15:44 +000010656void
Victor Stinner23e56682011-10-03 03:54:37 +020010657PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010658{
Victor Stinner23e56682011-10-03 03:54:37 +020010659 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010660 Py_UCS4 maxchar, maxchar2;
10661 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010662
10663 if (p_left == NULL) {
10664 if (!PyErr_Occurred())
10665 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010666 return;
10667 }
Victor Stinner23e56682011-10-03 03:54:37 +020010668 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010669 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010670 if (!PyErr_Occurred())
10671 PyErr_BadInternalCall();
10672 goto error;
10673 }
10674
Benjamin Petersonbac79492012-01-14 13:34:47 -050010675 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010676 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010677 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010678 goto error;
10679
Victor Stinner488fa492011-12-12 00:01:39 +010010680 /* Shortcuts */
10681 if (left == unicode_empty) {
10682 Py_DECREF(left);
10683 Py_INCREF(right);
10684 *p_left = right;
10685 return;
10686 }
10687 if (right == unicode_empty)
10688 return;
10689
10690 left_len = PyUnicode_GET_LENGTH(left);
10691 right_len = PyUnicode_GET_LENGTH(right);
10692 if (left_len > PY_SSIZE_T_MAX - right_len) {
10693 PyErr_SetString(PyExc_OverflowError,
10694 "strings are too large to concat");
10695 goto error;
10696 }
10697 new_len = left_len + right_len;
10698
10699 if (unicode_modifiable(left)
10700 && PyUnicode_CheckExact(right)
10701 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010702 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10703 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010704 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010705 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010706 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10707 {
10708 /* append inplace */
10709 if (unicode_resize(p_left, new_len) != 0) {
10710 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10711 * deallocated so it cannot be put back into
10712 * 'variable'. The MemoryError is raised when there
10713 * is no value in 'variable', which might (very
10714 * remotely) be a cause of incompatibilities.
10715 */
10716 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010717 }
Victor Stinner488fa492011-12-12 00:01:39 +010010718 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010719 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010720 }
Victor Stinner488fa492011-12-12 00:01:39 +010010721 else {
10722 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10723 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010724 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010725
Victor Stinner488fa492011-12-12 00:01:39 +010010726 /* Concat the two Unicode strings */
10727 res = PyUnicode_New(new_len, maxchar);
10728 if (res == NULL)
10729 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010730 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10731 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010732 Py_DECREF(left);
10733 *p_left = res;
10734 }
10735 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010736 return;
10737
10738error:
Victor Stinner488fa492011-12-12 00:01:39 +010010739 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010740}
10741
/* Same as PyUnicode_Append(pleft, right), except that the caller's reference
   to `right` is released afterwards.  Py_XDECREF is used, so this function
   itself does not dereference a NULL `right`. */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
10748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010749PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010753string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010754interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
10756static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010757unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010758{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010759 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010760 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010761 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 int kind1, kind2, kind;
10764 void *buf1, *buf2;
10765 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766
Jesus Ceaac451502011-04-20 17:09:23 +020010767 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10768 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 kind1 = PyUnicode_KIND(self);
10772 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010773 if (kind2 > kind1)
10774 return PyLong_FromLong(0);
10775 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 buf1 = PyUnicode_DATA(self);
10777 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010779 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 if (!buf2) {
10781 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 return NULL;
10783 }
10784 len1 = PyUnicode_GET_LENGTH(self);
10785 len2 = PyUnicode_GET_LENGTH(substring);
10786
10787 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010788 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 case PyUnicode_1BYTE_KIND:
10790 iresult = ucs1lib_count(
10791 ((Py_UCS1*)buf1) + start, end - start,
10792 buf2, len2, PY_SSIZE_T_MAX
10793 );
10794 break;
10795 case PyUnicode_2BYTE_KIND:
10796 iresult = ucs2lib_count(
10797 ((Py_UCS2*)buf1) + start, end - start,
10798 buf2, len2, PY_SSIZE_T_MAX
10799 );
10800 break;
10801 case PyUnicode_4BYTE_KIND:
10802 iresult = ucs4lib_count(
10803 ((Py_UCS4*)buf1) + start, end - start,
10804 buf2, len2, PY_SSIZE_T_MAX
10805 );
10806 break;
10807 default:
10808 assert(0); iresult = 0;
10809 }
10810
10811 result = PyLong_FromSsize_t(iresult);
10812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010813 if (kind2 != kind)
10814 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815
10816 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010817
Guido van Rossumd57fd912000-03-10 22:53:23 +000010818 return result;
10819}
10820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010821PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010822 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010824Encode S using the codec registered for encoding. Default encoding\n\
10825is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010826handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010827a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10828'xmlcharrefreplace' as well as any other name registered with\n\
10829codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
10831static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010832unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010833{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010834 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 char *encoding = NULL;
10836 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010837
Benjamin Peterson308d6372009-09-18 21:42:35 +000010838 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10839 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010841 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010842}
10843
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010844PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010845 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846\n\
10847Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010848If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
10850static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010851unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010853 Py_ssize_t i, j, line_pos, src_len, incr;
10854 Py_UCS4 ch;
10855 PyObject *u;
10856 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010858 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010859 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
10861 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010863
Antoine Pitrou22425222011-10-04 19:10:51 +020010864 if (PyUnicode_READY(self) == -1)
10865 return NULL;
10866
Thomas Wouters7e474022000-07-16 12:04:32 +000010867 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010868 src_len = PyUnicode_GET_LENGTH(self);
10869 i = j = line_pos = 0;
10870 kind = PyUnicode_KIND(self);
10871 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010872 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010873 for (; i < src_len; i++) {
10874 ch = PyUnicode_READ(kind, src_data, i);
10875 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010877 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010878 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010880 goto overflow;
10881 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010882 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010883 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010884 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 goto overflow;
10888 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010890 if (ch == '\n' || ch == '\r')
10891 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010893 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010894 if (!found)
10895 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010896
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010898 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010899 if (!u)
10900 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010901 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010902
Antoine Pitroue71d5742011-10-04 15:55:09 +020010903 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010904
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 for (; i < src_len; i++) {
10906 ch = PyUnicode_READ(kind, src_data, i);
10907 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 incr = tabsize - (line_pos % tabsize);
10910 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010911 FILL(kind, dest_data, ' ', j, incr);
10912 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010913 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010914 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010915 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010916 line_pos++;
10917 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010918 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010919 if (ch == '\n' || ch == '\r')
10920 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010921 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010922 }
10923 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010924 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010925
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010927 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929}
10930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010931PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010932 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933\n\
10934Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010935such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936arguments start and end are interpreted as in slice notation.\n\
10937\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010938Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
10940static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010942{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010943 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010944 Py_ssize_t start;
10945 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
Jesus Ceaac451502011-04-20 17:09:23 +020010948 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10949 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 if (PyUnicode_READY(self) == -1)
10953 return NULL;
10954 if (PyUnicode_READY(substring) == -1)
10955 return NULL;
10956
Victor Stinner7931d9a2011-11-04 00:22:48 +010010957 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010958
10959 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (result == -2)
10962 return NULL;
10963
Christian Heimes217cfd12007-12-02 14:31:20 +000010964 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010965}
10966
10967static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010968unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010970 void *data;
10971 enum PyUnicode_Kind kind;
10972 Py_UCS4 ch;
10973 PyObject *res;
10974
10975 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10976 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010977 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010978 }
10979 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10980 PyErr_SetString(PyExc_IndexError, "string index out of range");
10981 return NULL;
10982 }
10983 kind = PyUnicode_KIND(self);
10984 data = PyUnicode_DATA(self);
10985 ch = PyUnicode_READ(kind, data, index);
10986 if (ch < 256)
10987 return get_latin1_char(ch);
10988
10989 res = PyUnicode_New(1, ch);
10990 if (res == NULL)
10991 return NULL;
10992 kind = PyUnicode_KIND(res);
10993 data = PyUnicode_DATA(res);
10994 PyUnicode_WRITE(kind, data, 0, ch);
10995 assert(_PyUnicode_CheckConsistency(res, 1));
10996 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997}
10998
/* Believe it or not, this produces the same value for ASCII strings
   as bytes_hash(). */
/* Compute the hash of a str object and store it in the object.

   A stored hash other than -1 is returned directly.  Returns -1 if the
   string cannot be readied; a successfully computed hash is never -1. */
static Py_hash_t
unicode_hash(PyObject *self)
{
    Py_ssize_t len;
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */

#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    /* -1 in the hash slot means "not computed yet" */
    if (_PyUnicode_HASH(self) != -1)
        return _PyUnicode_HASH(self);
    if (PyUnicode_READY(self) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(self);
    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
    if (len == 0) {
        _PyUnicode_HASH(self) = 0;
        return 0;
    }

    /* The hash function as a macro, gets expanded three times below. */
    /* HASH(P) mixes the first element shifted left by 7 into x, then folds
       every element in with multiply-and-xor.  It advances P and counts
       `len` down to -1, which is why the length is re-read from the object
       after the switch. */
#define HASH(P)                                            \
    x ^= (Py_uhash_t) *P << 7;                             \
    while (--len >= 0)                                     \
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \

    x = (Py_uhash_t) _Py_HashSecret.prefix;
    /* One expansion per storage width so that *P reads whole code units */
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }
    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;

    /* -1 is both the "not computed" marker and this function's error
       return, so a computed -1 is replaced by -2. */
    if (x == -1)
        x = -2;
    _PyUnicode_HASH(self) = x;
    return x;
}
#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011062 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011064Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065
11066static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011067unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011069 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011070 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011071 Py_ssize_t start;
11072 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073
Jesus Ceaac451502011-04-20 17:09:23 +020011074 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11075 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 if (PyUnicode_READY(self) == -1)
11079 return NULL;
11080 if (PyUnicode_READY(substring) == -1)
11081 return NULL;
11082
Victor Stinner7931d9a2011-11-04 00:22:48 +010011083 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (result == -2)
11088 return NULL;
11089
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090 if (result < 0) {
11091 PyErr_SetString(PyExc_ValueError, "substring not found");
11092 return NULL;
11093 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011094
Christian Heimes217cfd12007-12-02 14:31:20 +000011095 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096}
11097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011098PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011101Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011102at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103
11104static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011105unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011107 Py_ssize_t i, length;
11108 int kind;
11109 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110 int cased;
11111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011112 if (PyUnicode_READY(self) == -1)
11113 return NULL;
11114 length = PyUnicode_GET_LENGTH(self);
11115 kind = PyUnicode_KIND(self);
11116 data = PyUnicode_DATA(self);
11117
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 if (length == 1)
11120 return PyBool_FromLong(
11121 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011123 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011126
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 for (i = 0; i < length; i++) {
11129 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011130
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11132 return PyBool_FromLong(0);
11133 else if (!cased && Py_UNICODE_ISLOWER(ch))
11134 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011136 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137}
11138
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011139PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011142Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144
11145static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011146unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 Py_ssize_t i, length;
11149 int kind;
11150 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151 int cased;
11152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 if (PyUnicode_READY(self) == -1)
11154 return NULL;
11155 length = PyUnicode_GET_LENGTH(self);
11156 kind = PyUnicode_KIND(self);
11157 data = PyUnicode_DATA(self);
11158
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (length == 1)
11161 return PyBool_FromLong(
11162 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011164 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011166 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011167
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 for (i = 0; i < length; i++) {
11170 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011171
Benjamin Peterson29060642009-01-31 22:14:21 +000011172 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11173 return PyBool_FromLong(0);
11174 else if (!cased && Py_UNICODE_ISUPPER(ch))
11175 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011177 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178}
11179
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011180PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011181 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011183Return True if S is a titlecased string and there is at least one\n\
11184character in S, i.e. upper- and titlecase characters may only\n\
11185follow uncased characters and lowercase characters only cased ones.\n\
11186Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
11188static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011189unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011191 Py_ssize_t i, length;
11192 int kind;
11193 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194 int cased, previous_is_cased;
11195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (PyUnicode_READY(self) == -1)
11197 return NULL;
11198 length = PyUnicode_GET_LENGTH(self);
11199 kind = PyUnicode_KIND(self);
11200 data = PyUnicode_DATA(self);
11201
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 if (length == 1) {
11204 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11205 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11206 (Py_UNICODE_ISUPPER(ch) != 0));
11207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011209 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011210 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011211 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011212
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213 cased = 0;
11214 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 for (i = 0; i < length; i++) {
11216 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011217
Benjamin Peterson29060642009-01-31 22:14:21 +000011218 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11219 if (previous_is_cased)
11220 return PyBool_FromLong(0);
11221 previous_is_cased = 1;
11222 cased = 1;
11223 }
11224 else if (Py_UNICODE_ISLOWER(ch)) {
11225 if (!previous_is_cased)
11226 return PyBool_FromLong(0);
11227 previous_is_cased = 1;
11228 cased = 1;
11229 }
11230 else
11231 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011233 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234}
11235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011237 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011239Return True if all characters in S are whitespace\n\
11240and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
11242static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011243unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 Py_ssize_t i, length;
11246 int kind;
11247 void *data;
11248
11249 if (PyUnicode_READY(self) == -1)
11250 return NULL;
11251 length = PyUnicode_GET_LENGTH(self);
11252 kind = PyUnicode_KIND(self);
11253 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011254
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 if (length == 1)
11257 return PyBool_FromLong(
11258 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011260 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 for (i = 0; i < length; i++) {
11265 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011266 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011269 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270}
11271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011274\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011275Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011277
11278static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011279unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281 Py_ssize_t i, length;
11282 int kind;
11283 void *data;
11284
11285 if (PyUnicode_READY(self) == -1)
11286 return NULL;
11287 length = PyUnicode_GET_LENGTH(self);
11288 kind = PyUnicode_KIND(self);
11289 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011290
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011291 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 if (length == 1)
11293 return PyBool_FromLong(
11294 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011295
11296 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011297 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300 for (i = 0; i < length; i++) {
11301 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011304 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011305}
11306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011307PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011308 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011310Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011311and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011312
11313static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011314unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011315{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011316 int kind;
11317 void *data;
11318 Py_ssize_t len, i;
11319
11320 if (PyUnicode_READY(self) == -1)
11321 return NULL;
11322
11323 kind = PyUnicode_KIND(self);
11324 data = PyUnicode_DATA(self);
11325 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011326
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011327 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011328 if (len == 1) {
11329 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11330 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11331 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011332
11333 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011334 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011335 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 for (i = 0; i < len; i++) {
11338 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011339 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011340 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011341 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011342 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011343}
11344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011345PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011346 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011348Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
11351static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011352unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 Py_ssize_t i, length;
11355 int kind;
11356 void *data;
11357
11358 if (PyUnicode_READY(self) == -1)
11359 return NULL;
11360 length = PyUnicode_GET_LENGTH(self);
11361 kind = PyUnicode_KIND(self);
11362 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 if (length == 1)
11366 return PyBool_FromLong(
11367 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011369 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011370 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011371 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 for (i = 0; i < length; i++) {
11374 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011377 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011378}
11379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011380PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011381 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011383Return True if all characters in S are digits\n\
11384and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385
11386static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011387unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 Py_ssize_t i, length;
11390 int kind;
11391 void *data;
11392
11393 if (PyUnicode_READY(self) == -1)
11394 return NULL;
11395 length = PyUnicode_GET_LENGTH(self);
11396 kind = PyUnicode_KIND(self);
11397 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 if (length == 1) {
11401 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11402 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11403 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011405 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011406 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 for (i = 0; i < length; i++) {
11410 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011412 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011413 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414}
11415
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011416PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011417 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011419Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421
11422static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011423unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 Py_ssize_t i, length;
11426 int kind;
11427 void *data;
11428
11429 if (PyUnicode_READY(self) == -1)
11430 return NULL;
11431 length = PyUnicode_GET_LENGTH(self);
11432 kind = PyUnicode_KIND(self);
11433 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (length == 1)
11437 return PyBool_FromLong(
11438 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011440 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 for (i = 0; i < length; i++) {
11445 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011448 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449}
11450
Martin v. Löwis47383402007-08-15 07:32:56 +000011451int
11452PyUnicode_IsIdentifier(PyObject *self)
11453{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 int kind;
11455 void *data;
11456 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011457 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 if (PyUnicode_READY(self) == -1) {
11460 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011461 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 }
11463
11464 /* Special case for empty strings */
11465 if (PyUnicode_GET_LENGTH(self) == 0)
11466 return 0;
11467 kind = PyUnicode_KIND(self);
11468 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011469
11470 /* PEP 3131 says that the first character must be in
11471 XID_Start and subsequent characters in XID_Continue,
11472 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011473 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011474 letters, digits, underscore). However, given the current
11475 definition of XID_Start and XID_Continue, it is sufficient
11476 to check just for these, except that _ must be allowed
11477 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011479 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011480 return 0;
11481
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011482 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011484 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011485 return 1;
11486}
11487
11488PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011490\n\
11491Return True if S is a valid identifier according\n\
11492to the language definition.");
11493
11494static PyObject*
11495unicode_isidentifier(PyObject *self)
11496{
11497 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11498}
11499
Georg Brandl559e5d72008-06-11 18:37:52 +000011500PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011501 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011502\n\
11503Return True if all characters in S are considered\n\
11504printable in repr() or S is empty, False otherwise.");
11505
11506static PyObject*
11507unicode_isprintable(PyObject *self)
11508{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 Py_ssize_t i, length;
11510 int kind;
11511 void *data;
11512
11513 if (PyUnicode_READY(self) == -1)
11514 return NULL;
11515 length = PyUnicode_GET_LENGTH(self);
11516 kind = PyUnicode_KIND(self);
11517 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011518
11519 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (length == 1)
11521 return PyBool_FromLong(
11522 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 for (i = 0; i < length; i++) {
11525 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011526 Py_RETURN_FALSE;
11527 }
11528 }
11529 Py_RETURN_TRUE;
11530}
11531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011532PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011533 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534\n\
11535Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011536iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011539unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011541 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542}
11543
Martin v. Löwis18e16552006-02-15 17:27:45 +000011544static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011545unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 if (PyUnicode_READY(self) == -1)
11548 return -1;
11549 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550}
11551
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011552PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011555Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011556done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
11558static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011559unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011561 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011562 Py_UCS4 fillchar = ' ';
11563
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011564 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 return NULL;
11566
Benjamin Petersonbac79492012-01-14 13:34:47 -050011567 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011568 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
Victor Stinnerc4b49542011-12-11 22:44:26 +010011570 if (PyUnicode_GET_LENGTH(self) >= width)
11571 return unicode_result_unchanged(self);
11572
11573 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574}
11575
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011576PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011579Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
11581static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011582unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011584 if (PyUnicode_READY(self) == -1)
11585 return NULL;
11586 if (PyUnicode_IS_ASCII(self))
11587 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011588 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589}
11590
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Argument-parsing format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11599
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011600/* externally visible for str.strip(unicode) */
11601PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011602_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011603{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011604 void *data;
11605 int kind;
11606 Py_ssize_t i, j, len;
11607 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11610 return NULL;
11611
11612 kind = PyUnicode_KIND(self);
11613 data = PyUnicode_DATA(self);
11614 len = PyUnicode_GET_LENGTH(self);
11615 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11616 PyUnicode_DATA(sepobj),
11617 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011618
Benjamin Peterson14339b62009-01-31 16:36:08 +000011619 i = 0;
11620 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621 while (i < len &&
11622 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 i++;
11624 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011625 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011626
Benjamin Peterson14339b62009-01-31 16:36:08 +000011627 j = len;
11628 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011629 do {
11630 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 } while (j >= i &&
11632 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011634 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011635
Victor Stinner7931d9a2011-11-04 00:22:48 +010011636 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637}
11638
11639PyObject*
11640PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11641{
11642 unsigned char *data;
11643 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011644 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645
Victor Stinnerde636f32011-10-01 03:55:54 +020011646 if (PyUnicode_READY(self) == -1)
11647 return NULL;
11648
Victor Stinner684d5fd2012-05-03 02:32:34 +020011649 length = PyUnicode_GET_LENGTH(self);
11650 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011651
Victor Stinner684d5fd2012-05-03 02:32:34 +020011652 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011653 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654
Victor Stinnerde636f32011-10-01 03:55:54 +020011655 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011656 PyErr_SetString(PyExc_IndexError, "string index out of range");
11657 return NULL;
11658 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011659 if (start >= length || end < start)
11660 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011661
Victor Stinner684d5fd2012-05-03 02:32:34 +020011662 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011663 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011664 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011665 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011666 }
11667 else {
11668 kind = PyUnicode_KIND(self);
11669 data = PyUnicode_1BYTE_DATA(self);
11670 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011671 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011672 length);
11673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
11676static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011677do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 int kind;
11680 void *data;
11681 Py_ssize_t len, i, j;
11682
11683 if (PyUnicode_READY(self) == -1)
11684 return NULL;
11685
11686 kind = PyUnicode_KIND(self);
11687 data = PyUnicode_DATA(self);
11688 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011689
Benjamin Peterson14339b62009-01-31 16:36:08 +000011690 i = 0;
11691 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011693 i++;
11694 }
11695 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 j = len;
11698 if (striptype != LEFTSTRIP) {
11699 do {
11700 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 j++;
11703 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011704
Victor Stinner7931d9a2011-11-04 00:22:48 +010011705 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706}
11707
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708
11709static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011710do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011711{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011712 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713
Benjamin Peterson14339b62009-01-31 16:36:08 +000011714 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11715 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
Benjamin Peterson14339b62009-01-31 16:36:08 +000011717 if (sep != NULL && sep != Py_None) {
11718 if (PyUnicode_Check(sep))
11719 return _PyUnicode_XStrip(self, striptype, sep);
11720 else {
11721 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "%s arg must be None or str",
11723 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 return NULL;
11725 }
11726 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011727
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729}
11730
11731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011732PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011733 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734\n\
11735Return a copy of the string S with leading and trailing\n\
11736whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011737If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738
11739static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011740unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011742 if (PyTuple_GET_SIZE(args) == 0)
11743 return do_strip(self, BOTHSTRIP); /* Common case */
11744 else
11745 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011746}
11747
11748
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011749PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011750 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011751\n\
11752Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011753If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011754
11755static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011756unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011757{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011758 if (PyTuple_GET_SIZE(args) == 0)
11759 return do_strip(self, LEFTSTRIP); /* Common case */
11760 else
11761 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011762}
11763
11764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011767\n\
11768Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011769If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011770
11771static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011773{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011774 if (PyTuple_GET_SIZE(args) == 0)
11775 return do_strip(self, RIGHTSTRIP); /* Common case */
11776 else
11777 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011778}
11779
11780
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011782unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011784 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
Serhiy Storchaka05997252013-01-26 12:14:02 +020011787 if (len < 1)
11788 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
Victor Stinnerc4b49542011-12-11 22:44:26 +010011790 /* no repeat, return original string */
11791 if (len == 1)
11792 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011793
Benjamin Petersonbac79492012-01-14 13:34:47 -050011794 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 return NULL;
11796
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011797 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011798 PyErr_SetString(PyExc_OverflowError,
11799 "repeated string is too long");
11800 return NULL;
11801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011803
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011804 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805 if (!u)
11806 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011807 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (PyUnicode_GET_LENGTH(str) == 1) {
11810 const int kind = PyUnicode_KIND(str);
11811 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011812 if (kind == PyUnicode_1BYTE_KIND) {
11813 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011814 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011815 }
11816 else if (kind == PyUnicode_2BYTE_KIND) {
11817 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011818 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011819 ucs2[n] = fill_char;
11820 } else {
11821 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11822 assert(kind == PyUnicode_4BYTE_KIND);
11823 for (n = 0; n < len; ++n)
11824 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 }
11827 else {
11828 /* number of characters copied this far */
11829 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011830 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 char *to = (char *) PyUnicode_DATA(u);
11832 Py_MEMCPY(to, PyUnicode_DATA(str),
11833 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 n = (done <= nchars-done) ? done : nchars-done;
11836 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011837 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839 }
11840
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011841 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011842 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843}
11844
Alexander Belopolsky40018472011-02-26 01:02:56 +000011845PyObject *
11846PyUnicode_Replace(PyObject *obj,
11847 PyObject *subobj,
11848 PyObject *replobj,
11849 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850{
11851 PyObject *self;
11852 PyObject *str1;
11853 PyObject *str2;
11854 PyObject *result;
11855
11856 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011857 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011858 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011860 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 Py_DECREF(self);
11862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 }
11864 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011865 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011866 Py_DECREF(self);
11867 Py_DECREF(str1);
11868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011870 if (PyUnicode_READY(self) == -1 ||
11871 PyUnicode_READY(str1) == -1 ||
11872 PyUnicode_READY(str2) == -1)
11873 result = NULL;
11874 else
11875 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 Py_DECREF(self);
11877 Py_DECREF(str1);
11878 Py_DECREF(str2);
11879 return result;
11880}
11881
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011882PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011883 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884\n\
11885Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011886old replaced by new. If the optional argument count is\n\
11887given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
11889static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 PyObject *str1;
11893 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011894 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 PyObject *result;
11896
Martin v. Löwis18e16552006-02-15 17:27:45 +000011897 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011899 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011902 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 return NULL;
11904 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011905 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 Py_DECREF(str1);
11907 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011908 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011909 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11910 result = NULL;
11911 else
11912 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
11914 Py_DECREF(str1);
11915 Py_DECREF(str2);
11916 return result;
11917}
11918
Alexander Belopolsky40018472011-02-26 01:02:56 +000011919static PyObject *
11920unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011922 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 Py_ssize_t isize;
11924 Py_ssize_t osize, squote, dquote, i, o;
11925 Py_UCS4 max, quote;
11926 int ikind, okind;
11927 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011930 return NULL;
11931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 isize = PyUnicode_GET_LENGTH(unicode);
11933 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 /* Compute length of output, quote characters, and
11936 maximum character */
11937 osize = 2; /* quotes */
11938 max = 127;
11939 squote = dquote = 0;
11940 ikind = PyUnicode_KIND(unicode);
11941 for (i = 0; i < isize; i++) {
11942 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11943 switch (ch) {
11944 case '\'': squote++; osize++; break;
11945 case '"': dquote++; osize++; break;
11946 case '\\': case '\t': case '\r': case '\n':
11947 osize += 2; break;
11948 default:
11949 /* Fast-path ASCII */
11950 if (ch < ' ' || ch == 0x7f)
11951 osize += 4; /* \xHH */
11952 else if (ch < 0x7f)
11953 osize++;
11954 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11955 osize++;
11956 max = ch > max ? ch : max;
11957 }
11958 else if (ch < 0x100)
11959 osize += 4; /* \xHH */
11960 else if (ch < 0x10000)
11961 osize += 6; /* \uHHHH */
11962 else
11963 osize += 10; /* \uHHHHHHHH */
11964 }
11965 }
11966
11967 quote = '\'';
11968 if (squote) {
11969 if (dquote)
11970 /* Both squote and dquote present. Use squote,
11971 and escape them */
11972 osize += squote;
11973 else
11974 quote = '"';
11975 }
11976
11977 repr = PyUnicode_New(osize, max);
11978 if (repr == NULL)
11979 return NULL;
11980 okind = PyUnicode_KIND(repr);
11981 odata = PyUnicode_DATA(repr);
11982
11983 PyUnicode_WRITE(okind, odata, 0, quote);
11984 PyUnicode_WRITE(okind, odata, osize-1, quote);
11985
11986 for (i = 0, o = 1; i < isize; i++) {
11987 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011988
11989 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if ((ch == quote) || (ch == '\\')) {
11991 PyUnicode_WRITE(okind, odata, o++, '\\');
11992 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011993 continue;
11994 }
11995
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011997 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 PyUnicode_WRITE(okind, odata, o++, '\\');
11999 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012000 }
12001 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 PyUnicode_WRITE(okind, odata, o++, '\\');
12003 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012004 }
12005 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 PyUnicode_WRITE(okind, odata, o++, '\\');
12007 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012008 }
12009
12010 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012011 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 PyUnicode_WRITE(okind, odata, o++, '\\');
12013 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012014 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12015 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012016 }
12017
Georg Brandl559e5d72008-06-11 18:37:52 +000012018 /* Copy ASCII characters as-is */
12019 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012021 }
12022
Benjamin Peterson29060642009-01-31 22:14:21 +000012023 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012024 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012025 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012026 (categories Z* and C* except ASCII space)
12027 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012029 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012030 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012031 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012033 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12034 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012035 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012036 /* Map 16-bit characters to '\uxxxx' */
12037 else if (ch <= 0xffff) {
12038 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012039 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12040 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12041 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12042 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012043 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012044 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012045 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012046 PyUnicode_WRITE(okind, odata, o++, 'U');
12047 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12048 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12050 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12053 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012055 }
12056 }
12057 /* Copy characters as-is */
12058 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012059 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012060 }
12061 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012064 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012065 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066}
12067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012068PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012069 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070\n\
12071Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012072such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073arguments start and end are interpreted as in slice notation.\n\
12074\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012075Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
12077static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012080 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012081 Py_ssize_t start;
12082 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012083 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084
Jesus Ceaac451502011-04-20 17:09:23 +020012085 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12086 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012089 if (PyUnicode_READY(self) == -1)
12090 return NULL;
12091 if (PyUnicode_READY(substring) == -1)
12092 return NULL;
12093
Victor Stinner7931d9a2011-11-04 00:22:48 +010012094 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
12096 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 if (result == -2)
12099 return NULL;
12100
Christian Heimes217cfd12007-12-02 14:31:20 +000012101 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102}
12103
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012104PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012105 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012107Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
12109static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012110unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012112 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012113 Py_ssize_t start;
12114 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012115 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
Jesus Ceaac451502011-04-20 17:09:23 +020012117 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12118 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (PyUnicode_READY(self) == -1)
12122 return NULL;
12123 if (PyUnicode_READY(substring) == -1)
12124 return NULL;
12125
Victor Stinner7931d9a2011-11-04 00:22:48 +010012126 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
12128 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (result == -2)
12131 return NULL;
12132
Guido van Rossumd57fd912000-03-10 22:53:23 +000012133 if (result < 0) {
12134 PyErr_SetString(PyExc_ValueError, "substring not found");
12135 return NULL;
12136 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137
Christian Heimes217cfd12007-12-02 14:31:20 +000012138 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139}
12140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012141PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012144Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012145done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012148unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012150 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 Py_UCS4 fillchar = ' ';
12152
Victor Stinnere9a29352011-10-01 02:14:59 +020012153 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012155
Benjamin Petersonbac79492012-01-14 13:34:47 -050012156 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 return NULL;
12158
Victor Stinnerc4b49542011-12-11 22:44:26 +010012159 if (PyUnicode_GET_LENGTH(self) >= width)
12160 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161
Victor Stinnerc4b49542011-12-11 22:44:26 +010012162 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163}
12164
Alexander Belopolsky40018472011-02-26 01:02:56 +000012165PyObject *
12166PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167{
12168 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012169
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170 s = PyUnicode_FromObject(s);
12171 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012172 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012173 if (sep != NULL) {
12174 sep = PyUnicode_FromObject(sep);
12175 if (sep == NULL) {
12176 Py_DECREF(s);
12177 return NULL;
12178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179 }
12180
Victor Stinner9310abb2011-10-05 00:59:23 +020012181 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
12183 Py_DECREF(s);
12184 Py_XDECREF(sep);
12185 return result;
12186}
12187
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012188PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012189 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190\n\
12191Return a list of the words in S, using sep as the\n\
12192delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012193splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012194whitespace string is a separator and empty strings are\n\
12195removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012196
12197static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012198unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012200 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012202 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012204 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12205 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206 return NULL;
12207
12208 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012209 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012211 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012213 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214}
12215
Thomas Wouters477c8d52006-05-27 19:21:47 +000012216PyObject *
12217PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12218{
12219 PyObject* str_obj;
12220 PyObject* sep_obj;
12221 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 int kind1, kind2, kind;
12223 void *buf1 = NULL, *buf2 = NULL;
12224 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012225
12226 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012227 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012228 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012229 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012230 if (!sep_obj) {
12231 Py_DECREF(str_obj);
12232 return NULL;
12233 }
12234 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12235 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012236 Py_DECREF(str_obj);
12237 return NULL;
12238 }
12239
Victor Stinner14f8f022011-10-05 20:58:25 +020012240 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012241 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012242 kind = Py_MAX(kind1, kind2);
12243 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012244 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012245 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 if (!buf1)
12247 goto onError;
12248 buf2 = PyUnicode_DATA(sep_obj);
12249 if (kind2 != kind)
12250 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12251 if (!buf2)
12252 goto onError;
12253 len1 = PyUnicode_GET_LENGTH(str_obj);
12254 len2 = PyUnicode_GET_LENGTH(sep_obj);
12255
Benjamin Petersonead6b532011-12-20 17:23:42 -060012256 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012258 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12259 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12260 else
12261 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 break;
12263 case PyUnicode_2BYTE_KIND:
12264 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12265 break;
12266 case PyUnicode_4BYTE_KIND:
12267 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12268 break;
12269 default:
12270 assert(0);
12271 out = 0;
12272 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012273
12274 Py_DECREF(sep_obj);
12275 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 if (kind1 != kind)
12277 PyMem_Free(buf1);
12278 if (kind2 != kind)
12279 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012280
12281 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012282 onError:
12283 Py_DECREF(sep_obj);
12284 Py_DECREF(str_obj);
12285 if (kind1 != kind && buf1)
12286 PyMem_Free(buf1);
12287 if (kind2 != kind && buf2)
12288 PyMem_Free(buf2);
12289 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012290}
12291
12292
12293PyObject *
12294PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12295{
12296 PyObject* str_obj;
12297 PyObject* sep_obj;
12298 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 int kind1, kind2, kind;
12300 void *buf1 = NULL, *buf2 = NULL;
12301 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012302
12303 str_obj = PyUnicode_FromObject(str_in);
12304 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012305 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306 sep_obj = PyUnicode_FromObject(sep_in);
12307 if (!sep_obj) {
12308 Py_DECREF(str_obj);
12309 return NULL;
12310 }
12311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 kind1 = PyUnicode_KIND(str_in);
12313 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012314 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 buf1 = PyUnicode_DATA(str_in);
12316 if (kind1 != kind)
12317 buf1 = _PyUnicode_AsKind(str_in, kind);
12318 if (!buf1)
12319 goto onError;
12320 buf2 = PyUnicode_DATA(sep_obj);
12321 if (kind2 != kind)
12322 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12323 if (!buf2)
12324 goto onError;
12325 len1 = PyUnicode_GET_LENGTH(str_obj);
12326 len2 = PyUnicode_GET_LENGTH(sep_obj);
12327
Benjamin Petersonead6b532011-12-20 17:23:42 -060012328 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012330 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12331 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12332 else
12333 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 break;
12335 case PyUnicode_2BYTE_KIND:
12336 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12337 break;
12338 case PyUnicode_4BYTE_KIND:
12339 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12340 break;
12341 default:
12342 assert(0);
12343 out = 0;
12344 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012345
12346 Py_DECREF(sep_obj);
12347 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 if (kind1 != kind)
12349 PyMem_Free(buf1);
12350 if (kind2 != kind)
12351 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352
12353 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 onError:
12355 Py_DECREF(sep_obj);
12356 Py_DECREF(str_obj);
12357 if (kind1 != kind && buf1)
12358 PyMem_Free(buf1);
12359 if (kind2 != kind && buf2)
12360 PyMem_Free(buf2);
12361 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362}
12363
12364PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012365 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012366\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012367Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012368the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012369found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012370
12371static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012372unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373{
Victor Stinner9310abb2011-10-05 00:59:23 +020012374 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375}
12376
12377PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012378 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012380Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012381the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012382separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383
12384static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012385unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012386{
Victor Stinner9310abb2011-10-05 00:59:23 +020012387 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012388}
12389
Alexander Belopolsky40018472011-02-26 01:02:56 +000012390PyObject *
12391PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012392{
12393 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012394
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012395 s = PyUnicode_FromObject(s);
12396 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 if (sep != NULL) {
12399 sep = PyUnicode_FromObject(sep);
12400 if (sep == NULL) {
12401 Py_DECREF(s);
12402 return NULL;
12403 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012404 }
12405
Victor Stinner9310abb2011-10-05 00:59:23 +020012406 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012407
12408 Py_DECREF(s);
12409 Py_XDECREF(sep);
12410 return result;
12411}
12412
12413PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012414 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012415\n\
12416Return a list of the words in S, using sep as the\n\
12417delimiter string, starting at the end of the string and\n\
12418working to the front. If maxsplit is given, at most maxsplit\n\
12419splits are done. If sep is not specified, any whitespace string\n\
12420is a separator.");
12421
12422static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012423unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012424{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012425 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012426 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012427 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012428
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012429 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12430 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012431 return NULL;
12432
12433 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012434 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012435 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012436 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012437 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012438 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012439}
12440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012441PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012442 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012443\n\
12444Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012445Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012446is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447
12448static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012449unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012451 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012452 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012453
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012454 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12455 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456 return NULL;
12457
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012458 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459}
12460
12461static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012462PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012464 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465}
12466
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012467PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012468 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469\n\
12470Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012471and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472
12473static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012474unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012476 if (PyUnicode_READY(self) == -1)
12477 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012478 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479}
12480
Georg Brandlceee0772007-11-27 23:48:05 +000012481PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012483\n\
12484Return a translation table usable for str.translate().\n\
12485If there is only one argument, it must be a dictionary mapping Unicode\n\
12486ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012487Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012488If there are two arguments, they must be strings of equal length, and\n\
12489in the resulting dictionary, each character in x will be mapped to the\n\
12490character at the same position in y. If there is a third argument, it\n\
12491must be a string, whose characters will be mapped to None in the result.");
12492
12493static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012494unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012495{
12496 PyObject *x, *y = NULL, *z = NULL;
12497 PyObject *new = NULL, *key, *value;
12498 Py_ssize_t i = 0;
12499 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500
Georg Brandlceee0772007-11-27 23:48:05 +000012501 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12502 return NULL;
12503 new = PyDict_New();
12504 if (!new)
12505 return NULL;
12506 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 int x_kind, y_kind, z_kind;
12508 void *x_data, *y_data, *z_data;
12509
Georg Brandlceee0772007-11-27 23:48:05 +000012510 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012511 if (!PyUnicode_Check(x)) {
12512 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12513 "be a string if there is a second argument");
12514 goto err;
12515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012517 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12518 "arguments must have equal length");
12519 goto err;
12520 }
12521 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012522 x_kind = PyUnicode_KIND(x);
12523 y_kind = PyUnicode_KIND(y);
12524 x_data = PyUnicode_DATA(x);
12525 y_data = PyUnicode_DATA(y);
12526 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12527 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012528 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012529 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012530 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012531 if (!value) {
12532 Py_DECREF(key);
12533 goto err;
12534 }
Georg Brandlceee0772007-11-27 23:48:05 +000012535 res = PyDict_SetItem(new, key, value);
12536 Py_DECREF(key);
12537 Py_DECREF(value);
12538 if (res < 0)
12539 goto err;
12540 }
12541 /* create entries for deleting chars in z */
12542 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 z_kind = PyUnicode_KIND(z);
12544 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012545 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012547 if (!key)
12548 goto err;
12549 res = PyDict_SetItem(new, key, Py_None);
12550 Py_DECREF(key);
12551 if (res < 0)
12552 goto err;
12553 }
12554 }
12555 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 int kind;
12557 void *data;
12558
Georg Brandlceee0772007-11-27 23:48:05 +000012559 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012560 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012561 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12562 "to maketrans it must be a dict");
12563 goto err;
12564 }
12565 /* copy entries into the new dict, converting string keys to int keys */
12566 while (PyDict_Next(x, &i, &key, &value)) {
12567 if (PyUnicode_Check(key)) {
12568 /* convert string keys to integer keys */
12569 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012570 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012571 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12572 "table must be of length 1");
12573 goto err;
12574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 kind = PyUnicode_KIND(key);
12576 data = PyUnicode_DATA(key);
12577 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012578 if (!newkey)
12579 goto err;
12580 res = PyDict_SetItem(new, newkey, value);
12581 Py_DECREF(newkey);
12582 if (res < 0)
12583 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012584 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012585 /* just keep integer keys */
12586 if (PyDict_SetItem(new, key, value) < 0)
12587 goto err;
12588 } else {
12589 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12590 "be strings or integers");
12591 goto err;
12592 }
12593 }
12594 }
12595 return new;
12596 err:
12597 Py_DECREF(new);
12598 return NULL;
12599}
12600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012601PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603\n\
12604Return a copy of the string S, where all characters have been mapped\n\
12605through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012606Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012607Unmapped characters are left untouched. Characters mapped to None\n\
12608are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609
12610static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614}
12615
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012616PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012617 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012619Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620
12621static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012622unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012624 if (PyUnicode_READY(self) == -1)
12625 return NULL;
12626 if (PyUnicode_IS_ASCII(self))
12627 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012628 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629}
12630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012631PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012632 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012634Pad a numeric string S with zeros on the left, to fill a field\n\
12635of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636
12637static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012638unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012639{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012640 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012641 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012642 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 int kind;
12644 void *data;
12645 Py_UCS4 chr;
12646
Martin v. Löwis18e16552006-02-15 17:27:45 +000012647 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012648 return NULL;
12649
Benjamin Petersonbac79492012-01-14 13:34:47 -050012650 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012651 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652
Victor Stinnerc4b49542011-12-11 22:44:26 +010012653 if (PyUnicode_GET_LENGTH(self) >= width)
12654 return unicode_result_unchanged(self);
12655
12656 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657
12658 u = pad(self, fill, 0, '0');
12659
Walter Dörwald068325e2002-04-15 13:36:47 +000012660 if (u == NULL)
12661 return NULL;
12662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 kind = PyUnicode_KIND(u);
12664 data = PyUnicode_DATA(u);
12665 chr = PyUnicode_READ(kind, data, fill);
12666
12667 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 PyUnicode_WRITE(kind, data, 0, chr);
12670 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671 }
12672
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012673 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012674 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012676
#if 0
/* Compiled out by the surrounding #if 0.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12684
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012685PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012686 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012688Return True if S starts with the specified prefix, False otherwise.\n\
12689With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012690With optional end, stop comparing S at that position.\n\
12691prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
12693static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012694unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012695 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012697 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012698 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012699 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012700 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012701 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702
Jesus Ceaac451502011-04-20 17:09:23 +020012703 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012705 if (PyTuple_Check(subobj)) {
12706 Py_ssize_t i;
12707 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012708 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012709 if (substring == NULL)
12710 return NULL;
12711 result = tailmatch(self, substring, start, end, -1);
12712 Py_DECREF(substring);
12713 if (result) {
12714 Py_RETURN_TRUE;
12715 }
12716 }
12717 /* nothing matched */
12718 Py_RETURN_FALSE;
12719 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012720 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012721 if (substring == NULL) {
12722 if (PyErr_ExceptionMatches(PyExc_TypeError))
12723 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12724 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012726 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012727 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012728 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012729 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012730}
12731
12732
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012733PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012736Return True if S ends with the specified suffix, False otherwise.\n\
12737With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012738With optional end, stop comparing S at that position.\n\
12739suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
12741static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012742unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012743 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012745 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012746 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012747 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012748 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012749 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012750
Jesus Ceaac451502011-04-20 17:09:23 +020012751 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012752 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012753 if (PyTuple_Check(subobj)) {
12754 Py_ssize_t i;
12755 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012756 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012758 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760 result = tailmatch(self, substring, start, end, +1);
12761 Py_DECREF(substring);
12762 if (result) {
12763 Py_RETURN_TRUE;
12764 }
12765 }
12766 Py_RETURN_FALSE;
12767 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012768 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012769 if (substring == NULL) {
12770 if (PyErr_ExceptionMatches(PyExc_TypeError))
12771 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12772 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012773 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012774 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012775 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012777 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012778}
12779
Victor Stinner202fdca2012-05-07 12:47:02 +020012780Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012781_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012782{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012783 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012784 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12785 writer->data = PyUnicode_DATA(writer->buffer);
12786 writer->kind = PyUnicode_KIND(writer->buffer);
12787}
12788
Victor Stinnerd3f08822012-05-29 12:57:52 +020012789void
12790_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012791{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012792 memset(writer, 0, sizeof(*writer));
12793#ifdef Py_DEBUG
12794 writer->kind = 5; /* invalid kind */
12795#endif
12796 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012797 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012798}
12799
Victor Stinnerd3f08822012-05-29 12:57:52 +020012800int
12801_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12802 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012803{
12804 Py_ssize_t newlen;
12805 PyObject *newbuffer;
12806
Victor Stinnerd3f08822012-05-29 12:57:52 +020012807 assert(length > 0);
12808
Victor Stinner202fdca2012-05-07 12:47:02 +020012809 if (length > PY_SSIZE_T_MAX - writer->pos) {
12810 PyErr_NoMemory();
12811 return -1;
12812 }
12813 newlen = writer->pos + length;
12814
Victor Stinnerd3f08822012-05-29 12:57:52 +020012815 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012816 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012817 /* overallocate 25% to limit the number of resize */
12818 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12819 newlen += newlen / 4;
12820 if (newlen < writer->min_length)
12821 newlen = writer->min_length;
12822 }
12823 writer->buffer = PyUnicode_New(newlen, maxchar);
12824 if (writer->buffer == NULL)
12825 return -1;
12826 _PyUnicodeWriter_Update(writer);
12827 return 0;
12828 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012829
Victor Stinnerd3f08822012-05-29 12:57:52 +020012830 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012831 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012832 /* overallocate 25% to limit the number of resize */
12833 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12834 newlen += newlen / 4;
12835 if (newlen < writer->min_length)
12836 newlen = writer->min_length;
12837 }
12838
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012839 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012840 /* resize + widen */
12841 newbuffer = PyUnicode_New(newlen, maxchar);
12842 if (newbuffer == NULL)
12843 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012844 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12845 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012846 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012847 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012848 }
12849 else {
12850 newbuffer = resize_compact(writer->buffer, newlen);
12851 if (newbuffer == NULL)
12852 return -1;
12853 }
12854 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012855 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012856 }
12857 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012858 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012859 newbuffer = PyUnicode_New(writer->size, maxchar);
12860 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012861 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012862 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12863 writer->buffer, 0, writer->pos);
12864 Py_DECREF(writer->buffer);
12865 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012866 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012867 }
12868 return 0;
12869}
12870
Victor Stinnerd3f08822012-05-29 12:57:52 +020012871int
12872_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12873{
12874 Py_UCS4 maxchar;
12875 Py_ssize_t len;
12876
12877 if (PyUnicode_READY(str) == -1)
12878 return -1;
12879 len = PyUnicode_GET_LENGTH(str);
12880 if (len == 0)
12881 return 0;
12882 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12883 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012884 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012885 Py_INCREF(str);
12886 writer->buffer = str;
12887 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012888 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012889 writer->size = 0;
12890 writer->pos += len;
12891 return 0;
12892 }
12893 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12894 return -1;
12895 }
12896 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12897 str, 0, len);
12898 writer->pos += len;
12899 return 0;
12900}
12901
12902PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012903_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012904{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012905 if (writer->pos == 0) {
12906 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012907 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012908 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012909 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012910 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12911 return writer->buffer;
12912 }
12913 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12914 PyObject *newbuffer;
12915 newbuffer = resize_compact(writer->buffer, writer->pos);
12916 if (newbuffer == NULL) {
12917 Py_DECREF(writer->buffer);
12918 return NULL;
12919 }
12920 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012921 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012922 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012923 return writer->buffer;
12924}
12925
Victor Stinnerd3f08822012-05-29 12:57:52 +020012926void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012927_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012928{
12929 Py_CLEAR(writer->buffer);
12930}
12931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012933
12934PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012935 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012936\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012937Return a formatted version of S, using substitutions from args and kwargs.\n\
12938The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012939
Eric Smith27bbca62010-11-04 17:06:58 +000012940PyDoc_STRVAR(format_map__doc__,
12941 "S.format_map(mapping) -> str\n\
12942\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012943Return a formatted version of S, using substitutions from mapping.\n\
12944The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012945
Eric Smith4a7d76d2008-05-30 18:10:19 +000012946static PyObject *
12947unicode__format__(PyObject* self, PyObject* args)
12948{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012949 PyObject *format_spec;
12950 _PyUnicodeWriter writer;
12951 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012952
12953 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12954 return NULL;
12955
Victor Stinnerd3f08822012-05-29 12:57:52 +020012956 if (PyUnicode_READY(self) == -1)
12957 return NULL;
12958 _PyUnicodeWriter_Init(&writer, 0);
12959 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12960 self, format_spec, 0,
12961 PyUnicode_GET_LENGTH(format_spec));
12962 if (ret == -1) {
12963 _PyUnicodeWriter_Dealloc(&writer);
12964 return NULL;
12965 }
12966 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012967}
12968
Eric Smith8c663262007-08-25 02:26:07 +000012969PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012970 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012971\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012972Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012973
12974static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012975unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 Py_ssize_t size;
12978
12979 /* If it's a compact object, account for base structure +
12980 character data. */
12981 if (PyUnicode_IS_COMPACT_ASCII(v))
12982 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12983 else if (PyUnicode_IS_COMPACT(v))
12984 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012985 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 else {
12987 /* If it is a two-block object, account for base object, and
12988 for character block if present. */
12989 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012990 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012992 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 }
12994 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012995 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012996 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012998 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012999 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000
13001 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013002}
13003
13004PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013006
13007static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013008unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013009{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013010 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 if (!copy)
13012 return NULL;
13013 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013014}
13015
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013017 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013018 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013019 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13020 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013021 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13022 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013023 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013024 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13025 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13026 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13027 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13028 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013029 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013030 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13031 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13032 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013033 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013034 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13035 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13036 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013037 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013038 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013039 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013040 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013041 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13042 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13043 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13044 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13045 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13046 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13047 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13048 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13049 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13050 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13051 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13052 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13053 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13054 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013055 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013056 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013057 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013058 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013059 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013060 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013061 {"maketrans", (PyCFunction) unicode_maketrans,
13062 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013063 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013064#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013065 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013066 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013067#endif
13068
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070 {NULL, NULL}
13071};
13072
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013073static PyObject *
13074unicode_mod(PyObject *v, PyObject *w)
13075{
Brian Curtindfc80e32011-08-10 20:28:54 -050013076 if (!PyUnicode_Check(v))
13077 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013078 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013079}
13080
13081static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013082 0, /*nb_add*/
13083 0, /*nb_subtract*/
13084 0, /*nb_multiply*/
13085 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013086};
13087
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 (lenfunc) unicode_length, /* sq_length */
13090 PyUnicode_Concat, /* sq_concat */
13091 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13092 (ssizeargfunc) unicode_getitem, /* sq_item */
13093 0, /* sq_slice */
13094 0, /* sq_ass_item */
13095 0, /* sq_ass_slice */
13096 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097};
13098
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013099static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013100unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 if (PyUnicode_READY(self) == -1)
13103 return NULL;
13104
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013105 if (PyIndex_Check(item)) {
13106 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013107 if (i == -1 && PyErr_Occurred())
13108 return NULL;
13109 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013110 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013111 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013112 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013113 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013114 PyObject *result;
13115 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013116 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013117 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013119 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013120 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013121 return NULL;
13122 }
13123
13124 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013125 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013127 slicelength == PyUnicode_GET_LENGTH(self)) {
13128 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013129 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013130 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013131 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013132 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013133 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013134 src_kind = PyUnicode_KIND(self);
13135 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013136 if (!PyUnicode_IS_ASCII(self)) {
13137 kind_limit = kind_maxchar_limit(src_kind);
13138 max_char = 0;
13139 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13140 ch = PyUnicode_READ(src_kind, src_data, cur);
13141 if (ch > max_char) {
13142 max_char = ch;
13143 if (max_char >= kind_limit)
13144 break;
13145 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013146 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013147 }
Victor Stinner55c99112011-10-13 01:17:06 +020013148 else
13149 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013150 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013151 if (result == NULL)
13152 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013153 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013154 dest_data = PyUnicode_DATA(result);
13155
13156 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13158 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013159 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013160 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013161 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013162 } else {
13163 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13164 return NULL;
13165 }
13166}
13167
13168static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013169 (lenfunc)unicode_length, /* mp_length */
13170 (binaryfunc)unicode_subscript, /* mp_subscript */
13171 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013172};
13173
Guido van Rossumd57fd912000-03-10 22:53:23 +000013174
Guido van Rossumd57fd912000-03-10 22:53:23 +000013175/* Helpers for PyUnicode_Format() */
13176
13177static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013178getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013180 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 (*p_argidx)++;
13183 if (arglen < 0)
13184 return args;
13185 else
13186 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187 }
13188 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013190 return NULL;
13191}
13192
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013193/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194
Victor Stinnerd3f08822012-05-29 12:57:52 +020013195static int
13196formatfloat(PyObject *v, int flags, int prec, int type,
13197 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013199 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013201 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013202
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203 x = PyFloat_AsDouble(v);
13204 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013206
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013208 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013209
Eric Smith0923d1d2009-04-16 20:16:10 +000013210 p = PyOS_double_to_string(x, type, prec,
13211 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013212 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013213 return -1;
13214 len = strlen(p);
13215 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013216 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13217 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013218 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013219 }
Victor Stinner184252a2012-06-16 02:57:41 +020013220 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013221 writer->pos += len;
13222 }
13223 else
13224 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013225 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013226 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227}
13228
Victor Stinnerd0880d52012-04-27 23:40:13 +020013229/* formatlong() emulates the format codes d, u, o, x and X, and
13230 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13231 * Python's regular ints.
13232 * Return value: a new PyUnicodeObject*, or NULL if error.
13233 * The output string is of the form
13234 * "-"? ("0x" | "0X")? digit+
13235 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13236 * set in flags. The case of hex digits will be correct,
13237 * There will be at least prec digits, zero-filled on the left if
13238 * necessary to get that many.
13239 * val object to be converted
13240 * flags bitmask of format flags; only F_ALT is looked at
13241 * prec minimum number of digits; 0-fill on left if needed
13242 * type a character in [duoxX]; u acts the same as d
13243 *
13244 * CAUTION: o, x and X conversions on regular ints can never
13245 * produce a '-' sign, but can for Python's unbounded ints.
13246 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013247static PyObject*
13248formatlong(PyObject *val, int flags, int prec, int type)
13249{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013250 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013251 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013252 Py_ssize_t i;
13253 int sign; /* 1 if '-', else 0 */
13254 int len; /* number of characters */
13255 Py_ssize_t llen;
13256 int numdigits; /* len == numnondigits + numdigits */
13257 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013258
Victor Stinnerd0880d52012-04-27 23:40:13 +020013259 /* Avoid exceeding SSIZE_T_MAX */
13260 if (prec > INT_MAX-3) {
13261 PyErr_SetString(PyExc_OverflowError,
13262 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013263 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013264 }
13265
13266 assert(PyLong_Check(val));
13267
13268 switch (type) {
13269 case 'd':
13270 case 'u':
13271 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013272 if (PyBool_Check(val))
13273 result = PyNumber_ToBase(val, 10);
13274 else
13275 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013276 break;
13277 case 'o':
13278 numnondigits = 2;
13279 result = PyNumber_ToBase(val, 8);
13280 break;
13281 case 'x':
13282 case 'X':
13283 numnondigits = 2;
13284 result = PyNumber_ToBase(val, 16);
13285 break;
13286 default:
13287 assert(!"'type' not in [duoxX]");
13288 }
13289 if (!result)
13290 return NULL;
13291
13292 assert(unicode_modifiable(result));
13293 assert(PyUnicode_IS_READY(result));
13294 assert(PyUnicode_IS_ASCII(result));
13295
13296 /* To modify the string in-place, there can only be one reference. */
13297 if (Py_REFCNT(result) != 1) {
13298 PyErr_BadInternalCall();
13299 return NULL;
13300 }
13301 buf = PyUnicode_DATA(result);
13302 llen = PyUnicode_GET_LENGTH(result);
13303 if (llen > INT_MAX) {
13304 PyErr_SetString(PyExc_ValueError,
13305 "string too large in _PyBytes_FormatLong");
13306 return NULL;
13307 }
13308 len = (int)llen;
13309 sign = buf[0] == '-';
13310 numnondigits += sign;
13311 numdigits = len - numnondigits;
13312 assert(numdigits > 0);
13313
13314 /* Get rid of base marker unless F_ALT */
13315 if (((flags & F_ALT) == 0 &&
13316 (type == 'o' || type == 'x' || type == 'X'))) {
13317 assert(buf[sign] == '0');
13318 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13319 buf[sign+1] == 'o');
13320 numnondigits -= 2;
13321 buf += 2;
13322 len -= 2;
13323 if (sign)
13324 buf[0] = '-';
13325 assert(len == numnondigits + numdigits);
13326 assert(numdigits > 0);
13327 }
13328
13329 /* Fill with leading zeroes to meet minimum width. */
13330 if (prec > numdigits) {
13331 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13332 numnondigits + prec);
13333 char *b1;
13334 if (!r1) {
13335 Py_DECREF(result);
13336 return NULL;
13337 }
13338 b1 = PyBytes_AS_STRING(r1);
13339 for (i = 0; i < numnondigits; ++i)
13340 *b1++ = *buf++;
13341 for (i = 0; i < prec - numdigits; i++)
13342 *b1++ = '0';
13343 for (i = 0; i < numdigits; i++)
13344 *b1++ = *buf++;
13345 *b1 = '\0';
13346 Py_DECREF(result);
13347 result = r1;
13348 buf = PyBytes_AS_STRING(result);
13349 len = numnondigits + prec;
13350 }
13351
13352 /* Fix up case for hex conversions. */
13353 if (type == 'X') {
13354 /* Need to convert all lower case letters to upper case.
13355 and need to convert 0x to 0X (and -0x to -0X). */
13356 for (i = 0; i < len; i++)
13357 if (buf[i] >= 'a' && buf[i] <= 'x')
13358 buf[i] -= 'a'-'A';
13359 }
13360 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13361 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013362 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013363 Py_DECREF(result);
13364 result = unicode;
13365 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013366 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013367}
13368
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013369static Py_UCS4
13370formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013372 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013373 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013375 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013377 goto onError;
13378 }
13379 else {
13380 /* Integer input truncated to a character */
13381 long x;
13382 x = PyLong_AsLong(v);
13383 if (x == -1 && PyErr_Occurred())
13384 goto onError;
13385
Victor Stinner8faf8212011-12-08 22:14:11 +010013386 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 PyErr_SetString(PyExc_OverflowError,
13388 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013389 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 }
13391
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013392 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013393 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013394
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013396 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013398 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013399}
13400
Alexander Belopolsky40018472011-02-26 01:02:56 +000013401PyObject *
13402PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013404 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013407 PyObject *temp = NULL;
13408 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013409 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013410 void *fmt;
13411 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013412 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013413 Py_ssize_t sublen;
13414 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013415
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 PyErr_BadInternalCall();
13418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013420 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013421 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013423 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013424 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013425 return NULL;
13426 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013428 fmt = PyUnicode_DATA(uformat);
13429 fmtkind = PyUnicode_KIND(uformat);
13430 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13431 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013432
Victor Stinnerd3f08822012-05-29 12:57:52 +020013433 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013434
Guido van Rossumd57fd912000-03-10 22:53:23 +000013435 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 arglen = PyTuple_Size(args);
13437 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438 }
13439 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 arglen = -1;
13441 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013443 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445
13446 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013447 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013448 Py_ssize_t nonfmtpos;
13449 nonfmtpos = fmtpos++;
13450 while (fmtcnt >= 0 &&
13451 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13452 fmtpos++;
13453 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013454 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013455 if (fmtcnt < 0)
13456 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013457 sublen = fmtpos - nonfmtpos;
13458 maxchar = _PyUnicode_FindMaxChar(uformat,
13459 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013460 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013461 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013462
Victor Stinnerd3f08822012-05-29 12:57:52 +020013463 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13464 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013465 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 }
13467 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 /* Got a format specifier */
13469 int flags = 0;
13470 Py_ssize_t width = -1;
13471 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013473 Py_UCS4 fill;
13474 int sign;
13475 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013476 int isnumok;
13477 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013478 void *pbuf = NULL;
13479 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013480 Py_UCS4 bufmaxchar;
13481 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013483 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013484 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13485 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013487 Py_ssize_t keylen;
13488 PyObject *key;
13489 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013490
Benjamin Peterson29060642009-01-31 22:14:21 +000013491 if (dict == NULL) {
13492 PyErr_SetString(PyExc_TypeError,
13493 "format requires a mapping");
13494 goto onError;
13495 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013498 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013499 /* Skip over balanced parentheses */
13500 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013501 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13502 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013504 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013505 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013506 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 if (fmtcnt < 0 || pcount > 0) {
13510 PyErr_SetString(PyExc_ValueError,
13511 "incomplete format key");
13512 goto onError;
13513 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013514 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013515 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 if (key == NULL)
13517 goto onError;
13518 if (args_owned) {
13519 Py_DECREF(args);
13520 args_owned = 0;
13521 }
13522 args = PyObject_GetItem(dict, key);
13523 Py_DECREF(key);
13524 if (args == NULL) {
13525 goto onError;
13526 }
13527 args_owned = 1;
13528 arglen = -1;
13529 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013530 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013532 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13533 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013534 case '-': flags |= F_LJUST; continue;
13535 case '+': flags |= F_SIGN; continue;
13536 case ' ': flags |= F_BLANK; continue;
13537 case '#': flags |= F_ALT; continue;
13538 case '0': flags |= F_ZERO; continue;
13539 }
13540 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013541 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 if (c == '*') {
13543 v = getnextarg(args, arglen, &argidx);
13544 if (v == NULL)
13545 goto onError;
13546 if (!PyLong_Check(v)) {
13547 PyErr_SetString(PyExc_TypeError,
13548 "* wants int");
13549 goto onError;
13550 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013551 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 if (width == -1 && PyErr_Occurred())
13553 goto onError;
13554 if (width < 0) {
13555 flags |= F_LJUST;
13556 width = -width;
13557 }
13558 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013559 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 }
13561 else if (c >= '0' && c <= '9') {
13562 width = c - '0';
13563 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013564 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 if (c < '0' || c > '9')
13566 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013567 /* Since c is unsigned, the RHS would end up as unsigned,
13568 mixing signed and unsigned comparison. Since c is between
13569 '0' and '9', casting to int is safe. */
13570 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 PyErr_SetString(PyExc_ValueError,
13572 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013573 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 }
13575 width = width*10 + (c - '0');
13576 }
13577 }
13578 if (c == '.') {
13579 prec = 0;
13580 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013582 if (c == '*') {
13583 v = getnextarg(args, arglen, &argidx);
13584 if (v == NULL)
13585 goto onError;
13586 if (!PyLong_Check(v)) {
13587 PyErr_SetString(PyExc_TypeError,
13588 "* wants int");
13589 goto onError;
13590 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013591 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 if (prec == -1 && PyErr_Occurred())
13593 goto onError;
13594 if (prec < 0)
13595 prec = 0;
13596 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 }
13599 else if (c >= '0' && c <= '9') {
13600 prec = c - '0';
13601 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013602 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013603 if (c < '0' || c > '9')
13604 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013605 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013606 PyErr_SetString(PyExc_ValueError,
13607 "prec too big");
13608 goto onError;
13609 }
13610 prec = prec*10 + (c - '0');
13611 }
13612 }
13613 } /* prec */
13614 if (fmtcnt >= 0) {
13615 if (c == 'h' || c == 'l' || c == 'L') {
13616 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 }
13619 }
13620 if (fmtcnt < 0) {
13621 PyErr_SetString(PyExc_ValueError,
13622 "incomplete format");
13623 goto onError;
13624 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013625 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013626 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013627
13628 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013629 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013630 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013631 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13632 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013633 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013634 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013635
Victor Stinneraff3cc62012-04-30 05:19:21 +020013636 v = getnextarg(args, arglen, &argidx);
13637 if (v == NULL)
13638 goto onError;
13639
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013641 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013642 fill = ' ';
13643 switch (c) {
13644
Benjamin Peterson29060642009-01-31 22:14:21 +000013645 case 's':
13646 case 'r':
13647 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013648 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13649 /* Fast path */
13650 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13651 goto onError;
13652 goto nextarg;
13653 }
13654
Victor Stinner808fc0a2010-03-22 12:50:40 +000013655 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013656 temp = v;
13657 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013658 }
13659 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 if (c == 's')
13661 temp = PyObject_Str(v);
13662 else if (c == 'r')
13663 temp = PyObject_Repr(v);
13664 else
13665 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 break;
13668
13669 case 'i':
13670 case 'd':
13671 case 'u':
13672 case 'o':
13673 case 'x':
13674 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013675 if (PyLong_CheckExact(v)
13676 && width == -1 && prec == -1
13677 && !(flags & (F_SIGN | F_BLANK)))
13678 {
13679 /* Fast path */
13680 switch(c)
13681 {
13682 case 'd':
13683 case 'i':
13684 case 'u':
13685 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13686 goto onError;
13687 goto nextarg;
13688 case 'x':
13689 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13690 goto onError;
13691 goto nextarg;
13692 case 'o':
13693 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13694 goto onError;
13695 goto nextarg;
13696 default:
13697 break;
13698 }
13699 }
13700
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 isnumok = 0;
13702 if (PyNumber_Check(v)) {
13703 PyObject *iobj=NULL;
13704
13705 if (PyLong_Check(v)) {
13706 iobj = v;
13707 Py_INCREF(iobj);
13708 }
13709 else {
13710 iobj = PyNumber_Long(v);
13711 }
13712 if (iobj!=NULL) {
13713 if (PyLong_Check(iobj)) {
13714 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013715 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013716 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013717 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 }
13719 else {
13720 Py_DECREF(iobj);
13721 }
13722 }
13723 }
13724 if (!isnumok) {
13725 PyErr_Format(PyExc_TypeError,
13726 "%%%c format: a number is required, "
13727 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13728 goto onError;
13729 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013730 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013731 fill = '0';
13732 break;
13733
13734 case 'e':
13735 case 'E':
13736 case 'f':
13737 case 'F':
13738 case 'g':
13739 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013740 if (width == -1 && prec == -1
13741 && !(flags & (F_SIGN | F_BLANK)))
13742 {
13743 /* Fast path */
13744 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13745 goto onError;
13746 goto nextarg;
13747 }
13748
Benjamin Peterson29060642009-01-31 22:14:21 +000013749 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013750 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013751 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013752 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13753 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013754 break;
13755
13756 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013757 {
13758 Py_UCS4 ch = formatchar(v);
13759 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013760 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013761 if (width == -1 && prec == -1) {
13762 /* Fast path */
13763 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13764 goto onError;
13765 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13766 writer.pos += 1;
13767 goto nextarg;
13768 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013769 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013770 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013771 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013772
13773 default:
13774 PyErr_Format(PyExc_ValueError,
13775 "unsupported format character '%c' (0x%x) "
13776 "at index %zd",
13777 (31<=c && c<=126) ? (char)c : '?',
13778 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013779 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013780 goto onError;
13781 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013782 if (temp == NULL)
13783 goto onError;
13784 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013785
13786 if (width == -1 && prec == -1
13787 && !(flags & (F_SIGN | F_BLANK)))
13788 {
13789 /* Fast path */
13790 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13791 goto onError;
13792 goto nextarg;
13793 }
13794
Victor Stinneraff3cc62012-04-30 05:19:21 +020013795 if (PyUnicode_READY(temp) == -1) {
13796 Py_CLEAR(temp);
13797 goto onError;
13798 }
13799 kind = PyUnicode_KIND(temp);
13800 pbuf = PyUnicode_DATA(temp);
13801 len = PyUnicode_GET_LENGTH(temp);
13802
13803 if (c == 's' || c == 'r' || c == 'a') {
13804 if (prec >= 0 && len > prec)
13805 len = prec;
13806 }
13807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 /* pbuf is initialized here. */
13809 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013810 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013811 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13812 if (ch == '-' || ch == '+') {
13813 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013814 len--;
13815 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013816 }
13817 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013818 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013820 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013821 else
13822 sign = 0;
13823 }
13824 if (width < len)
13825 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013826
13827 /* Compute the length and maximum character of the
13828 written characters */
13829 bufmaxchar = 127;
13830 if (!(flags & F_LJUST)) {
13831 if (sign) {
13832 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013833 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013834 }
13835 else {
13836 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013838 }
13839 }
13840 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013842
13843 buflen = width;
13844 if (sign && len == width)
13845 buflen++;
13846
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013847 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013848 goto onError;
13849
13850 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013851 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013852 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013853 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13854 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013855 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013856 if (width > len)
13857 width--;
13858 }
13859 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013860 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013861 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013862 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013863 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13864 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13865 writer.pos += 2;
13866 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013867 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 width -= 2;
13869 if (width < 0)
13870 width = 0;
13871 len -= 2;
13872 }
13873 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013874 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013875 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13876 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013877 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013878 }
13879 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013880 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013881 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13882 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013883 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013884 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13886 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013887 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13888 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13889 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013890 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013891 }
13892 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013893
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013894 if (len) {
13895 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13896 temp, pindex, len);
13897 writer.pos += len;
13898 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013899 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013900 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013901 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13902 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013903 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013904
Victor Stinnerd3f08822012-05-29 12:57:52 +020013905nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013906 if (dict && (argidx < arglen) && c != '%') {
13907 PyErr_SetString(PyExc_TypeError,
13908 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013909 goto onError;
13910 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013911 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013912 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013913 } /* until end */
13914 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013915 PyErr_SetString(PyExc_TypeError,
13916 "not all arguments converted during string formatting");
13917 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013918 }
13919
13920 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 }
13923 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013924 Py_XDECREF(temp);
13925 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013926 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013927
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013930 Py_XDECREF(temp);
13931 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013932 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013934 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013935 }
13936 return NULL;
13937}
13938
Jeremy Hylton938ace62002-07-17 16:30:39 +000013939static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013940unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13941
Tim Peters6d6c1a32001-08-02 04:15:00 +000013942static PyObject *
13943unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13944{
Benjamin Peterson29060642009-01-31 22:14:21 +000013945 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013946 static char *kwlist[] = {"object", "encoding", "errors", 0};
13947 char *encoding = NULL;
13948 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013949
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 if (type != &PyUnicode_Type)
13951 return unicode_subtype_new(type, args, kwds);
13952 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013953 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 return NULL;
13955 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020013956 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 if (encoding == NULL && errors == NULL)
13958 return PyObject_Str(x);
13959 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013960 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013961}
13962
Guido van Rossume023fe02001-08-30 03:12:59 +000013963static PyObject *
13964unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13965{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013966 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013967 Py_ssize_t length, char_size;
13968 int share_wstr, share_utf8;
13969 unsigned int kind;
13970 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013971
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013973
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013974 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013975 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013977 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013978 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013979 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013980 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013981 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013982
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013983 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013984 if (self == NULL) {
13985 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013986 return NULL;
13987 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013988 kind = PyUnicode_KIND(unicode);
13989 length = PyUnicode_GET_LENGTH(unicode);
13990
13991 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013992#ifdef Py_DEBUG
13993 _PyUnicode_HASH(self) = -1;
13994#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013995 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013996#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013997 _PyUnicode_STATE(self).interned = 0;
13998 _PyUnicode_STATE(self).kind = kind;
13999 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014000 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014001 _PyUnicode_STATE(self).ready = 1;
14002 _PyUnicode_WSTR(self) = NULL;
14003 _PyUnicode_UTF8_LENGTH(self) = 0;
14004 _PyUnicode_UTF8(self) = NULL;
14005 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014006 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014007
14008 share_utf8 = 0;
14009 share_wstr = 0;
14010 if (kind == PyUnicode_1BYTE_KIND) {
14011 char_size = 1;
14012 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14013 share_utf8 = 1;
14014 }
14015 else if (kind == PyUnicode_2BYTE_KIND) {
14016 char_size = 2;
14017 if (sizeof(wchar_t) == 2)
14018 share_wstr = 1;
14019 }
14020 else {
14021 assert(kind == PyUnicode_4BYTE_KIND);
14022 char_size = 4;
14023 if (sizeof(wchar_t) == 4)
14024 share_wstr = 1;
14025 }
14026
14027 /* Ensure we won't overflow the length. */
14028 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14029 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014030 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014032 data = PyObject_MALLOC((length + 1) * char_size);
14033 if (data == NULL) {
14034 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014035 goto onError;
14036 }
14037
Victor Stinnerc3c74152011-10-02 20:39:55 +020014038 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014039 if (share_utf8) {
14040 _PyUnicode_UTF8_LENGTH(self) = length;
14041 _PyUnicode_UTF8(self) = data;
14042 }
14043 if (share_wstr) {
14044 _PyUnicode_WSTR_LENGTH(self) = length;
14045 _PyUnicode_WSTR(self) = (wchar_t *)data;
14046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014047
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014048 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014049 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014050 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014051#ifdef Py_DEBUG
14052 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14053#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014054 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014055 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014056
14057onError:
14058 Py_DECREF(unicode);
14059 Py_DECREF(self);
14060 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014061}
14062
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014063PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014064"str(object='') -> str\n\
14065str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014066\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014067Create a new string object from the given object. If encoding or\n\
14068errors is specified, then the object must expose a data buffer\n\
14069that will be decoded using the given encoding and error handler.\n\
14070Otherwise, returns the result of object.__str__() (if defined)\n\
14071or repr(object).\n\
14072encoding defaults to sys.getdefaultencoding().\n\
14073errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014074
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014075static PyObject *unicode_iter(PyObject *seq);
14076
/* Type object for the built-in "str" type.  The slots are initialized
   positionally; the trailing comment on each line names the slot that
   the value fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14120
14121/* Initialize the Unicode implementation */
14122
Victor Stinner3a50e702011-10-18 21:21:00 +020014123int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014125 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014126 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014127 0x000A, /* LINE FEED */
14128 0x000D, /* CARRIAGE RETURN */
14129 0x001C, /* FILE SEPARATOR */
14130 0x001D, /* GROUP SEPARATOR */
14131 0x001E, /* RECORD SEPARATOR */
14132 0x0085, /* NEXT LINE */
14133 0x2028, /* LINE SEPARATOR */
14134 0x2029, /* PARAGRAPH SEPARATOR */
14135 };
14136
Fred Drakee4315f52000-05-09 19:53:39 +000014137 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014138 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014139 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014140 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014141 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014142
Guido van Rossumcacfc072002-05-24 19:01:59 +000014143 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014145
14146 /* initialize the linebreak bloom filter */
14147 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014148 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014149 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014150
14151 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014152
Benjamin Petersonc4311282012-10-30 23:21:10 -040014153 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14154 Py_FatalError("Can't initialize field name iterator type");
14155
14156 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14157 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014158
Victor Stinner3a50e702011-10-18 21:21:00 +020014159#ifdef HAVE_MBCS
14160 winver.dwOSVersionInfoSize = sizeof(winver);
14161 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14162 PyErr_SetFromWindowsErr(0);
14163 return -1;
14164 }
14165#endif
14166 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014167}
14168
14169/* Finalize the Unicode implementation */
14170
/* Always reports zero; the function does no other work.
   NOTE(review): presumably retained for API compatibility -- confirm. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14176
Guido van Rossumd57fd912000-03-10 22:53:23 +000014177void
Thomas Wouters78890102000-07-22 19:25:51 +000014178_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014180 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181
Serhiy Storchaka05997252013-01-26 12:14:02 +020014182 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014183
Serhiy Storchaka05997252013-01-26 12:14:02 +020014184 for (i = 0; i < 256; i++)
14185 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014186 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014187 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014188}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014189
Walter Dörwald16807132007-05-25 13:52:07 +000014190void
14191PyUnicode_InternInPlace(PyObject **p)
14192{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014193 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014194 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014195#ifdef Py_DEBUG
14196 assert(s != NULL);
14197 assert(_PyUnicode_CHECK(s));
14198#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014199 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014200 return;
14201#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014202 /* If it's a subclass, we don't really know what putting
14203 it in the interned dict might do. */
14204 if (!PyUnicode_CheckExact(s))
14205 return;
14206 if (PyUnicode_CHECK_INTERNED(s))
14207 return;
14208 if (interned == NULL) {
14209 interned = PyDict_New();
14210 if (interned == NULL) {
14211 PyErr_Clear(); /* Don't leave an exception */
14212 return;
14213 }
14214 }
14215 /* It might be that the GetItem call fails even
14216 though the key is present in the dictionary,
14217 namely when this happens during a stack overflow. */
14218 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014219 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014220 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014221
Benjamin Peterson29060642009-01-31 22:14:21 +000014222 if (t) {
14223 Py_INCREF(t);
14224 Py_DECREF(*p);
14225 *p = t;
14226 return;
14227 }
Walter Dörwald16807132007-05-25 13:52:07 +000014228
Benjamin Peterson14339b62009-01-31 16:36:08 +000014229 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014230 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014231 PyErr_Clear();
14232 PyThreadState_GET()->recursion_critical = 0;
14233 return;
14234 }
14235 PyThreadState_GET()->recursion_critical = 0;
14236 /* The two references in interned are not counted by refcnt.
14237 The deallocator will take care of this */
14238 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014239 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014240}
14241
14242void
14243PyUnicode_InternImmortal(PyObject **p)
14244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014245 PyUnicode_InternInPlace(p);
14246 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014247 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014248 Py_INCREF(*p);
14249 }
Walter Dörwald16807132007-05-25 13:52:07 +000014250}
14251
14252PyObject *
14253PyUnicode_InternFromString(const char *cp)
14254{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 PyObject *s = PyUnicode_FromString(cp);
14256 if (s == NULL)
14257 return NULL;
14258 PyUnicode_InternInPlace(&s);
14259 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014260}
14261
Alexander Belopolsky40018472011-02-26 01:02:56 +000014262void
14263_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014264{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014265 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014266 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014267 Py_ssize_t i, n;
14268 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014269
Benjamin Peterson14339b62009-01-31 16:36:08 +000014270 if (interned == NULL || !PyDict_Check(interned))
14271 return;
14272 keys = PyDict_Keys(interned);
14273 if (keys == NULL || !PyList_Check(keys)) {
14274 PyErr_Clear();
14275 return;
14276 }
Walter Dörwald16807132007-05-25 13:52:07 +000014277
Benjamin Peterson14339b62009-01-31 16:36:08 +000014278 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14279 detector, interned unicode strings are not forcibly deallocated;
14280 rather, we give them their stolen references back, and then clear
14281 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014282
Benjamin Peterson14339b62009-01-31 16:36:08 +000014283 n = PyList_GET_SIZE(keys);
14284 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014285 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014286 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014287 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014288 if (PyUnicode_READY(s) == -1) {
14289 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014290 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014291 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014292 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 case SSTATE_NOT_INTERNED:
14294 /* XXX Shouldn't happen */
14295 break;
14296 case SSTATE_INTERNED_IMMORTAL:
14297 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014298 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014299 break;
14300 case SSTATE_INTERNED_MORTAL:
14301 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014302 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 break;
14304 default:
14305 Py_FatalError("Inconsistent interned string state.");
14306 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014307 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014308 }
14309 fprintf(stderr, "total size of all interned strings: "
14310 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14311 "mortal/immortal\n", mortal_size, immortal_size);
14312 Py_DECREF(keys);
14313 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014314 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014315}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014316
14317
14318/********************* Unicode Iterator **************************/
14319
14320typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014321 PyObject_HEAD
14322 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014323 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014324} unicodeiterobject;
14325
14326static void
14327unicodeiter_dealloc(unicodeiterobject *it)
14328{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014329 _PyObject_GC_UNTRACK(it);
14330 Py_XDECREF(it->it_seq);
14331 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014332}
14333
14334static int
14335unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14336{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014337 Py_VISIT(it->it_seq);
14338 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014339}
14340
14341static PyObject *
14342unicodeiter_next(unicodeiterobject *it)
14343{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014344 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014345
Benjamin Peterson14339b62009-01-31 16:36:08 +000014346 assert(it != NULL);
14347 seq = it->it_seq;
14348 if (seq == NULL)
14349 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014350 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014352 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14353 int kind = PyUnicode_KIND(seq);
14354 void *data = PyUnicode_DATA(seq);
14355 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14356 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014357 if (item != NULL)
14358 ++it->it_index;
14359 return item;
14360 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014361
Benjamin Peterson14339b62009-01-31 16:36:08 +000014362 Py_DECREF(seq);
14363 it->it_seq = NULL;
14364 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014365}
14366
14367static PyObject *
14368unicodeiter_len(unicodeiterobject *it)
14369{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014370 Py_ssize_t len = 0;
14371 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014372 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014373 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014374}
14375
14376PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14377
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014378static PyObject *
14379unicodeiter_reduce(unicodeiterobject *it)
14380{
14381 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014382 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014383 it->it_seq, it->it_index);
14384 } else {
14385 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14386 if (u == NULL)
14387 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014388 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014389 }
14390}
14391
14392PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14393
14394static PyObject *
14395unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14396{
14397 Py_ssize_t index = PyLong_AsSsize_t(state);
14398 if (index == -1 && PyErr_Occurred())
14399 return NULL;
14400 if (index < 0)
14401 index = 0;
14402 it->it_index = index;
14403 Py_RETURN_NONE;
14404}
14405
14406PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14407
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014408static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014409 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014410 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014411 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14412 reduce_doc},
14413 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14414 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014415 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014416};
14417
14418PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014419 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14420 "str_iterator", /* tp_name */
14421 sizeof(unicodeiterobject), /* tp_basicsize */
14422 0, /* tp_itemsize */
14423 /* methods */
14424 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14425 0, /* tp_print */
14426 0, /* tp_getattr */
14427 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014428 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 0, /* tp_repr */
14430 0, /* tp_as_number */
14431 0, /* tp_as_sequence */
14432 0, /* tp_as_mapping */
14433 0, /* tp_hash */
14434 0, /* tp_call */
14435 0, /* tp_str */
14436 PyObject_GenericGetAttr, /* tp_getattro */
14437 0, /* tp_setattro */
14438 0, /* tp_as_buffer */
14439 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14440 0, /* tp_doc */
14441 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14442 0, /* tp_clear */
14443 0, /* tp_richcompare */
14444 0, /* tp_weaklistoffset */
14445 PyObject_SelfIter, /* tp_iter */
14446 (iternextfunc)unicodeiter_next, /* tp_iternext */
14447 unicodeiter_methods, /* tp_methods */
14448 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014449};
14450
14451static PyObject *
14452unicode_iter(PyObject *seq)
14453{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014454 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014455
Benjamin Peterson14339b62009-01-31 16:36:08 +000014456 if (!PyUnicode_Check(seq)) {
14457 PyErr_BadInternalCall();
14458 return NULL;
14459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014460 if (PyUnicode_READY(seq) == -1)
14461 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014462 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14463 if (it == NULL)
14464 return NULL;
14465 it->it_index = 0;
14466 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014467 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014468 _PyObject_GC_TRACK(it);
14469 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014470}
14471
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014472
14473size_t
14474Py_UNICODE_strlen(const Py_UNICODE *u)
14475{
14476 int res = 0;
14477 while(*u++)
14478 res++;
14479 return res;
14480}
14481
14482Py_UNICODE*
14483Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14484{
14485 Py_UNICODE *u = s1;
14486 while ((*u++ = *s2++));
14487 return s1;
14488}
14489
14490Py_UNICODE*
14491Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14492{
14493 Py_UNICODE *u = s1;
14494 while ((*u++ = *s2++))
14495 if (n-- == 0)
14496 break;
14497 return s1;
14498}
14499
14500Py_UNICODE*
14501Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14502{
14503 Py_UNICODE *u1 = s1;
14504 u1 += Py_UNICODE_strlen(u1);
14505 Py_UNICODE_strcpy(u1, s2);
14506 return s1;
14507}
14508
14509int
14510Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14511{
14512 while (*s1 && *s2 && *s1 == *s2)
14513 s1++, s2++;
14514 if (*s1 && *s2)
14515 return (*s1 < *s2) ? -1 : +1;
14516 if (*s1)
14517 return 1;
14518 if (*s2)
14519 return -1;
14520 return 0;
14521}
14522
14523int
14524Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14525{
14526 register Py_UNICODE u1, u2;
14527 for (; n != 0; n--) {
14528 u1 = *s1;
14529 u2 = *s2;
14530 if (u1 != u2)
14531 return (u1 < u2) ? -1 : +1;
14532 if (u1 == '\0')
14533 return 0;
14534 s1++;
14535 s2++;
14536 }
14537 return 0;
14538}
14539
14540Py_UNICODE*
14541Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14542{
14543 const Py_UNICODE *p;
14544 for (p = s; *p; p++)
14545 if (*p == c)
14546 return (Py_UNICODE*)p;
14547 return NULL;
14548}
14549
14550Py_UNICODE*
14551Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14552{
14553 const Py_UNICODE *p;
14554 p = s + Py_UNICODE_strlen(s);
14555 while (p != s) {
14556 p--;
14557 if (*p == c)
14558 return (Py_UNICODE*)p;
14559 }
14560 return NULL;
14561}
Victor Stinner331ea922010-08-10 16:37:20 +000014562
Victor Stinner71133ff2010-09-01 23:43:53 +000014563Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014564PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014565{
Victor Stinner577db2c2011-10-11 22:12:48 +020014566 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014567 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014569 if (!PyUnicode_Check(unicode)) {
14570 PyErr_BadArgument();
14571 return NULL;
14572 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014573 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014574 if (u == NULL)
14575 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014576 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014577 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014578 PyErr_NoMemory();
14579 return NULL;
14580 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014581 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014582 size *= sizeof(Py_UNICODE);
14583 copy = PyMem_Malloc(size);
14584 if (copy == NULL) {
14585 PyErr_NoMemory();
14586 return NULL;
14587 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014588 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014589 return copy;
14590}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014591
Georg Brandl66c221e2010-10-14 07:04:07 +000014592/* A _string module, to export formatter_parser and formatter_field_name_split
14593 to the string.Formatter class implemented in Python. */
14594
14595static PyMethodDef _string_methods[] = {
14596 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14597 METH_O, PyDoc_STR("split the argument as a field name")},
14598 {"formatter_parser", (PyCFunction) formatter_parser,
14599 METH_O, PyDoc_STR("parse the argument as a format string")},
14600 {NULL, NULL}
14601};
14602
14603static struct PyModuleDef _string_module = {
14604 PyModuleDef_HEAD_INIT,
14605 "_string",
14606 PyDoc_STR("string helper module"),
14607 0,
14608 _string_methods,
14609 NULL,
14610 NULL,
14611 NULL,
14612 NULL
14613};
14614
14615PyMODINIT_FUNC
14616PyInit__string(void)
14617{
14618 return PyModule_Create(&_string_module);
14619}
14620
14621
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014622#ifdef __cplusplus
14623}
14624#endif