blob: e8459138a82aad3b0b6cbd32f21252cb28361aa3 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Guido van Rossumd57fd912000-03-10 22:53:23 +000050/* Endianness switches; defaults to little endian */
51
52#ifdef WORDS_BIGENDIAN
53# define BYTEORDER_IS_BIG_ENDIAN
54#else
55# define BYTEORDER_IS_LITTLE_ENDIAN
56#endif
57
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
67#ifdef __cplusplus
68extern "C" {
69#endif
70
/* Largest code point accepted by this implementation (Unicode 6.0):
   U+10FFFF, i.e. 1,114,111. */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros of this file.  Debug builds run
   the structural consistency check (without scanning the content), other
   builds only verify the object type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* Low-level accessors for the string structs.

   The macros that consist of a bare member access perform no checking and
   expand to the struct member itself.  PyUnicode_UTF8(),
   PyUnicode_UTF8_LENGTH(), _PyUnicode_KIND() and _PyUnicode_GET_LENGTH()
   first assert that the argument passes _PyUnicode_CHECK(). */

/* Cached UTF-8 pointer stored in the compact struct. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Stored length of the cached UTF-8 data. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string.  For a compact ASCII string this is the
   string length itself, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked read of the kind and of the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().
   The result is the bitwise OR of both operands, so it is never smaller
   than either of them but may be larger than their true maximum. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
119
/* File-local replacement for PyUnicode_READY(): asserts that op passes
   _PyUnicode_CHECK(), then evaluates to 0 when the string is already ready
   and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
          0 :                                           \
          _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200126
/* True if the UTF-8 pointer of op is the same pointer as its character
   data, i.e. no separate UTF-8 buffer is used.  Must not be used on compact
   ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character
   data.  The macro only refers to its own parameter, so it works regardless
   of how the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
134
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: the string is not compact ASCII, a UTF-8 pointer is set, and that
   pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: a wstr pointer is set and either the string is not ready yet or
   that pointer is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
150
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each argument is evaluated exactly once.  "to" is parenthesized before
   being cast so that an expression argument (e.g. "buf + offset") is
   converted as a whole.  Characters are copied four at a time as long as
   possible, the remaining ones (at most three) one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *) (to);                \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200174
Walter Dörwald16807132007-05-25 13:52:07 +0000175/* This dictionary holds all interned unicode strings. Note that references
176 to strings in this dictionary are *not* counted in the string's ob_refcnt.
177 When the interned string reaches a refcnt of 0 the string deallocation
178 function will delete the reference from this dictionary.
179
180 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000181 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000182*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200183static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 do { \
190 if (unicode_empty != NULL) \
191 Py_INCREF(unicode_empty); \
192 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193 unicode_empty = PyUnicode_New(0, 0); \
194 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200195 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200196 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_RETURN_UNICODE_EMPTY() \
202 do { \
203 _Py_INCREF_UNICODE_EMPTY(); \
204 return unicode_empty; \
205 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200207/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* Single character Unicode strings in the Latin-1 range are being
211 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
/* Fast detection of the most frequent whitespace characters.

   128-entry table indexed by an ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
244
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200245/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200247static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100248static int unicode_modifiable(PyObject *unicode);
249
Victor Stinnerfe226c02011-10-03 03:52:20 +0200250
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200252_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
253static PyObject *
254_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
255static PyObject *
256_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
257
258static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000260 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100261 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000262 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264static void
265raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300266 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100267 PyObject *unicode,
268 Py_ssize_t startpos, Py_ssize_t endpos,
269 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000270
/* Same for linebreaks.

   128-entry table indexed by an ASCII code; an entry is 1 for:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
   and 0 for everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
298
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300299/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
300 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000302PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000303{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000304#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000305 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000307 /* This is actually an illegal character, so it should
308 not be passed to unichr. */
309 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000310#endif
311}
312
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a string object.

   op must be a Unicode object.  The state flags, the data pointer and the
   cached utf8/wstr members are checked against each other with assert().
   If check_content is non-zero and the string is not of the wchar kind,
   the characters are scanned as well to verify that the narrowest possible
   kind is used and that the data ends with a null character.

   Every violation triggers an assertion failure; the function itself
   always returns 1 so that the call can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    /* kind whose data doubles as the wstr representation */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wstr_shared_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wstr_shared_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact != 1) {
            /* non-compact string: the data pointer is stored in the object */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data is its own UTF-8 representation */
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else
                    assert(compact->utf8 != data);
            }
            else {
                /* string that has not been made ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wstr_shared_kind) {
                /* same width as wchar_t: wstr must alias the data */
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 maxchar = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (maxchar < ch)
                maxchar = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0)
                assert(maxchar < 128);
            else {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the data must end with a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429static PyObject*
430unicode_result_wchar(PyObject *unicode)
431{
432#ifndef Py_DEBUG
433 Py_ssize_t len;
434
435 assert(Py_REFCNT(unicode) == 1);
436
437 len = _PyUnicode_WSTR_LENGTH(unicode);
438 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100439 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200440 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 }
442
443 if (len == 1) {
444 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
445 if (ch < 256) {
446 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
447 Py_DECREF(unicode);
448 return latin1_char;
449 }
450 }
451
452 if (_PyUnicode_Ready(unicode) < 0) {
453 Py_XDECREF(unicode);
454 return NULL;
455 }
456#else
457 /* don't make the result ready in debug mode to ensure that the caller
458 makes the string ready before using it */
459 assert(_PyUnicode_CheckConsistency(unicode, 1));
460#endif
461 return unicode;
462}
463
464static PyObject*
465unicode_result_ready(PyObject *unicode)
466{
467 Py_ssize_t length;
468
469 length = PyUnicode_GET_LENGTH(unicode);
470 if (length == 0) {
471 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200473 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 }
475 return unicode_empty;
476 }
477
478 if (length == 1) {
479 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
480 if (ch < 256) {
481 PyObject *latin1_char = unicode_latin1[ch];
482 if (latin1_char != NULL) {
483 if (unicode != latin1_char) {
484 Py_INCREF(latin1_char);
485 Py_DECREF(unicode);
486 }
487 return latin1_char;
488 }
489 else {
490 assert(_PyUnicode_CheckConsistency(unicode, 1));
491 Py_INCREF(unicode);
492 unicode_latin1[ch] = unicode;
493 return unicode;
494 }
495 }
496 }
497
498 assert(_PyUnicode_CheckConsistency(unicode, 1));
499 return unicode;
500}
501
502static PyObject*
503unicode_result(PyObject *unicode)
504{
505 assert(_PyUnicode_CHECK(unicode));
506 if (PyUnicode_IS_READY(unicode))
507 return unicode_result_ready(unicode);
508 else
509 return unicode_result_wchar(unicode);
510}
511
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512static PyObject*
513unicode_result_unchanged(PyObject *unicode)
514{
515 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500516 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100517 return NULL;
518 Py_INCREF(unicode);
519 return unicode;
520 }
521 else
522 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100523 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100524}
525
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): where it is filled in is not visible in this part of the
   file. */
#if defined(HAVE_MBCS)
static OSVERSIONINFOEX winver;
#endif
529
Thomas Wouters477c8d52006-05-27 19:21:47 +0000530/* --- Bloom Filters ----------------------------------------------------- */
531
532/* stuff to implement simple "bloom filters" for Unicode characters.
533 to keep things simple, we use a single bitmask, using the least 5
534 bits from each unicode characters as the bit index. */
535
536/* the linebreak mask is set up by Unicode_Init below */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538#if LONG_BIT >= 128
539#define BLOOM_WIDTH 128
540#elif LONG_BIT >= 64
541#define BLOOM_WIDTH 64
542#elif LONG_BIT >= 32
543#define BLOOM_WIDTH 32
544#else
545#error "LONG_BIT is smaller than 32"
546#endif
547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548#define BLOOM_MASK unsigned long
549
Serhiy Storchaka05997252013-01-26 12:14:02 +0200550static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
/* Set the bit selected by character ch in mask (mask must be an lvalue). */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if the bit selected by character ch is set in mask.  Distinct
   characters can select the same bit, so a non-zero result only means that
   ch *may* have been added.  mask is parenthesized so that an expression
   such as "a | b" is tested as a whole. */
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for characters below 128, otherwise the
   bloom mask is consulted first and Py_UNICODE_ISLINEBREAK() only when the
   mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Alexander Belopolsky40018472011-02-26 01:02:56 +0000559Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000561{
562 /* calculate simple bloom-style bitmask for a given unicode string */
563
Antoine Pitrouf068f942010-01-13 14:19:12 +0000564 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565 Py_ssize_t i;
566
567 mask = 0;
568 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 return mask;
572}
573
/* True if character chr occurs in the string object str.  mask is the bloom
   mask of str: it is tested first so that PyUnicode_FindChar() only runs
   when the mask does not already rule chr out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000577
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200578/* Compilation of templated routines */
579
580#include "stringlib/asciilib.h"
581#include "stringlib/fastsearch.h"
582#include "stringlib/partition.h"
583#include "stringlib/split.h"
584#include "stringlib/count.h"
585#include "stringlib/find.h"
586#include "stringlib/find_max_char.h"
587#include "stringlib/localeutil.h"
588#include "stringlib/undef.h"
589
590#include "stringlib/ucs1lib.h"
591#include "stringlib/fastsearch.h"
592#include "stringlib/partition.h"
593#include "stringlib/split.h"
594#include "stringlib/count.h"
595#include "stringlib/find.h"
596#include "stringlib/find_max_char.h"
597#include "stringlib/localeutil.h"
598#include "stringlib/undef.h"
599
600#include "stringlib/ucs2lib.h"
601#include "stringlib/fastsearch.h"
602#include "stringlib/partition.h"
603#include "stringlib/split.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs4lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
616#include "stringlib/find_max_char.h"
617#include "stringlib/localeutil.h"
618#include "stringlib/undef.h"
619
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620#include "stringlib/unicodedefs.h"
621#include "stringlib/fastsearch.h"
622#include "stringlib/count.h"
623#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100624#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200625
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626/* --- Unicode Object ----------------------------------------------------- */
627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200629fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200631Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
632 Py_ssize_t size, Py_UCS4 ch,
633 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
636
637 switch (kind) {
638 case PyUnicode_1BYTE_KIND:
639 {
640 Py_UCS1 ch1 = (Py_UCS1) ch;
641 if (ch1 == ch)
642 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
643 else
644 return -1;
645 }
646 case PyUnicode_2BYTE_KIND:
647 {
648 Py_UCS2 ch2 = (Py_UCS2) ch;
649 if (ch2 == ch)
650 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
651 else
652 return -1;
653 }
654 case PyUnicode_4BYTE_KIND:
655 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
656 default:
657 assert(0);
658 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660}
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200670 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100672 assert(PyUnicode_IS_COMPACT(unicode));
673
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200674 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100675 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 struct_size = sizeof(PyASCIIObject);
677 else
678 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
682 PyErr_NoMemory();
683 return NULL;
684 }
685 new_size = (struct_size + (length + 1) * char_size);
686
Victor Stinner84def372011-12-11 20:04:56 +0100687 _Py_DEC_REFTOTAL;
688 _Py_ForgetReference(unicode);
689
690 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
691 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100692 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 PyErr_NoMemory();
694 return NULL;
695 }
Victor Stinner84def372011-12-11 20:04:56 +0100696 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200700 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100702 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200703 _PyUnicode_WSTR_LENGTH(unicode) = length;
704 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
706 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200707 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200708 return unicode;
709}
710
Alexander Belopolsky40018472011-02-26 01:02:56 +0000711static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200712resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000713{
Victor Stinner95663112011-10-04 01:03:50 +0200714 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100715 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200717 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (PyUnicode_IS_READY(unicode)) {
720 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200721 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200722 void *data;
723
724 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200725 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200726 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
727 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200728
729 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
730 PyErr_NoMemory();
731 return -1;
732 }
733 new_size = (length + 1) * char_size;
734
Victor Stinner7a9105a2011-12-12 00:13:42 +0100735 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
736 {
737 PyObject_DEL(_PyUnicode_UTF8(unicode));
738 _PyUnicode_UTF8(unicode) = NULL;
739 _PyUnicode_UTF8_LENGTH(unicode) = 0;
740 }
741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 data = (PyObject *)PyObject_REALLOC(data, new_size);
743 if (data == NULL) {
744 PyErr_NoMemory();
745 return -1;
746 }
747 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200748 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200749 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200750 _PyUnicode_WSTR_LENGTH(unicode) = length;
751 }
752 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200753 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200754 _PyUnicode_UTF8_LENGTH(unicode) = length;
755 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200756 _PyUnicode_LENGTH(unicode) = length;
757 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200758 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200759 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200762 }
Victor Stinner95663112011-10-04 01:03:50 +0200763 assert(_PyUnicode_WSTR(unicode) != NULL);
764
765 /* check for integer overflow */
766 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
767 PyErr_NoMemory();
768 return -1;
769 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100770 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200771 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100772 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200773 if (!wstr) {
774 PyErr_NoMemory();
775 return -1;
776 }
777 _PyUnicode_WSTR(unicode) = wstr;
778 _PyUnicode_WSTR(unicode)[length] = 0;
779 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200780 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781 return 0;
782}
783
Victor Stinnerfe226c02011-10-03 03:52:20 +0200784static PyObject*
785resize_copy(PyObject *unicode, Py_ssize_t length)
786{
787 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100788 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200789 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100790
Benjamin Petersonbac79492012-01-14 13:34:47 -0500791 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100792 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793
794 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
795 if (copy == NULL)
796 return NULL;
797
798 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200799 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200800 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200801 }
802 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200803 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100804
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200805 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 if (w == NULL)
807 return NULL;
808 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
809 copy_length = Py_MIN(copy_length, length);
810 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
811 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200812 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
814}
815
Guido van Rossumd57fd912000-03-10 22:53:23 +0000816/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000817 Ux0000 terminated; some code (e.g. new_identifier)
818 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000819
820 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000821 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000822
823*/
824
Alexander Belopolsky40018472011-02-26 01:02:56 +0000825static PyUnicodeObject *
826_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000827{
828 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200829 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830
Thomas Wouters477c8d52006-05-27 19:21:47 +0000831 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832 if (length == 0 && unicode_empty != NULL) {
833 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200834 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835 }
836
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000837 /* Ensure we won't overflow the size. */
838 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
839 return (PyUnicodeObject *)PyErr_NoMemory();
840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200841 if (length < 0) {
842 PyErr_SetString(PyExc_SystemError,
843 "Negative size passed to _PyUnicode_New");
844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 }
846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200847 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
848 if (unicode == NULL)
849 return NULL;
850 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
851 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
852 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100853 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000854 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100855 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200857
Jeremy Hyltond8082792003-09-16 19:41:39 +0000858 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000859 * the caller fails before initializing str -- unicode_resize()
860 * reads str[0], and the Keep-Alive optimization can keep memory
861 * allocated for str alive across a call to unicode_dealloc(unicode).
862 * We don't want unicode_resize to read uninitialized memory in
863 * that case.
864 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200865 _PyUnicode_WSTR(unicode)[0] = 0;
866 _PyUnicode_WSTR(unicode)[length] = 0;
867 _PyUnicode_WSTR_LENGTH(unicode) = length;
868 _PyUnicode_HASH(unicode) = -1;
869 _PyUnicode_STATE(unicode).interned = 0;
870 _PyUnicode_STATE(unicode).kind = 0;
871 _PyUnicode_STATE(unicode).compact = 0;
872 _PyUnicode_STATE(unicode).ready = 0;
873 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200874 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200876 _PyUnicode_UTF8(unicode) = NULL;
877 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100878 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879 return unicode;
880}
881
Victor Stinnerf42dc442011-10-02 23:33:16 +0200882static const char*
883unicode_kind_name(PyObject *unicode)
884{
Victor Stinner42dfd712011-10-03 14:41:45 +0200885 /* don't check consistency: unicode_kind_name() is called from
886 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 if (!PyUnicode_IS_COMPACT(unicode))
888 {
889 if (!PyUnicode_IS_READY(unicode))
890 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600891 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 {
893 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200894 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200895 return "legacy ascii";
896 else
897 return "legacy latin1";
898 case PyUnicode_2BYTE_KIND:
899 return "legacy UCS2";
900 case PyUnicode_4BYTE_KIND:
901 return "legacy UCS4";
902 default:
903 return "<legacy invalid kind>";
904 }
905 }
906 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600907 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200908 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200909 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200910 return "ascii";
911 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200912 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200913 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200914 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200915 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200916 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200917 default:
918 return "<invalid compact kind>";
919 }
920}
921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200922#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
927
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
931void *_PyUnicode_data(void *unicode){
932 printf("obj %p\n", unicode);
933 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
934 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
935 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
936 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
937 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
938 return PyUnicode_DATA(unicode);
939}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200940
941void
942_PyUnicode_Dump(PyObject *op)
943{
944 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200945 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
946 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
947 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200948
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200950 {
951 if (ascii->state.ascii)
952 data = (ascii + 1);
953 else
954 data = (compact + 1);
955 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200956 else
957 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200958 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
959
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 if (ascii->wstr == data)
961 printf("shared ");
962 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200963
Victor Stinnera3b334d2011-10-03 13:53:37 +0200964 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200965 printf(" (%zu), ", compact->wstr_length);
966 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
967 printf("shared ");
968 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200969 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200970 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200971}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972#endif
973
974PyObject *
975PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
976{
977 PyObject *obj;
978 PyCompactUnicodeObject *unicode;
979 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200980 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200981 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982 Py_ssize_t char_size;
983 Py_ssize_t struct_size;
984
985 /* Optimization for empty strings */
986 if (size == 0 && unicode_empty != NULL) {
987 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200988 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 }
990
Victor Stinner9e9d6892011-10-04 01:02:02 +0200991 is_ascii = 0;
992 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 struct_size = sizeof(PyCompactUnicodeObject);
994 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200995 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996 char_size = 1;
997 is_ascii = 1;
998 struct_size = sizeof(PyASCIIObject);
999 }
1000 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001001 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 char_size = 1;
1003 }
1004 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001005 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 char_size = 2;
1007 if (sizeof(wchar_t) == 2)
1008 is_sharing = 1;
1009 }
1010 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001011 if (maxchar > MAX_UNICODE) {
1012 PyErr_SetString(PyExc_SystemError,
1013 "invalid maximum character passed to PyUnicode_New");
1014 return NULL;
1015 }
Victor Stinner8f825062012-04-27 13:55:39 +02001016 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001017 char_size = 4;
1018 if (sizeof(wchar_t) == 4)
1019 is_sharing = 1;
1020 }
1021
1022 /* Ensure we won't overflow the size. */
1023 if (size < 0) {
1024 PyErr_SetString(PyExc_SystemError,
1025 "Negative size passed to PyUnicode_New");
1026 return NULL;
1027 }
1028 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1029 return PyErr_NoMemory();
1030
1031 /* Duplicated allocation code from _PyObject_New() instead of a call to
1032 * PyObject_New() so we are able to allocate space for the object and
1033 * it's data buffer.
1034 */
1035 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1036 if (obj == NULL)
1037 return PyErr_NoMemory();
1038 obj = PyObject_INIT(obj, &PyUnicode_Type);
1039 if (obj == NULL)
1040 return NULL;
1041
1042 unicode = (PyCompactUnicodeObject *)obj;
1043 if (is_ascii)
1044 data = ((PyASCIIObject*)obj) + 1;
1045 else
1046 data = unicode + 1;
1047 _PyUnicode_LENGTH(unicode) = size;
1048 _PyUnicode_HASH(unicode) = -1;
1049 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001050 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 _PyUnicode_STATE(unicode).compact = 1;
1052 _PyUnicode_STATE(unicode).ready = 1;
1053 _PyUnicode_STATE(unicode).ascii = is_ascii;
1054 if (is_ascii) {
1055 ((char*)data)[size] = 0;
1056 _PyUnicode_WSTR(unicode) = NULL;
1057 }
Victor Stinner8f825062012-04-27 13:55:39 +02001058 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001059 ((char*)data)[size] = 0;
1060 _PyUnicode_WSTR(unicode) = NULL;
1061 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001063 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001064 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 else {
1066 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001067 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001068 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001070 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001071 ((Py_UCS4*)data)[size] = 0;
1072 if (is_sharing) {
1073 _PyUnicode_WSTR_LENGTH(unicode) = size;
1074 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1075 }
1076 else {
1077 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1078 _PyUnicode_WSTR(unicode) = NULL;
1079 }
1080 }
Victor Stinner8f825062012-04-27 13:55:39 +02001081#ifdef Py_DEBUG
1082 /* Fill the data with invalid characters to detect bugs earlier.
1083 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1084 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1085 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1086 memset(data, 0xff, size * kind);
1087#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001088 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 return obj;
1090}
1091
#if SIZEOF_WCHAR_T == 2
/* Convert a 16-bit wchar_t buffer to UCS4, storing the result in the 4-byte
   data of unicode.  A high surrogate immediately followed by a low surrogate
   is joined into a single code point; any other unit, including a lone
   surrogate, is copied unchanged.  The other conversions are implemented as
   macros for efficiency.

   The caller must have sized unicode for exactly the number of code points
   produced, plus one for a terminating null character (which is not written
   here). */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src+1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Ordinary unit or unpaired surrogate */
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1131
Victor Stinnercd9950f2011-10-02 00:34:53 +02001132static int
Victor Stinner488fa492011-12-12 00:01:39 +01001133unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001134{
Victor Stinner488fa492011-12-12 00:01:39 +01001135 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001136 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001137 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001138 return -1;
1139 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001140 return 0;
1141}
1142
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001143static int
1144_copy_characters(PyObject *to, Py_ssize_t to_start,
1145 PyObject *from, Py_ssize_t from_start,
1146 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001148 unsigned int from_kind, to_kind;
1149 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001150
Victor Stinneree4544c2012-05-09 22:24:08 +02001151 assert(0 <= how_many);
1152 assert(0 <= from_start);
1153 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001154 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001155 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001156 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157
Victor Stinnerd3f08822012-05-29 12:57:52 +02001158 assert(PyUnicode_Check(to));
1159 assert(PyUnicode_IS_READY(to));
1160 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1161
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001162 if (how_many == 0)
1163 return 0;
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001166 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001167 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001168 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169
Victor Stinnerf1852262012-06-16 16:38:26 +02001170#ifdef Py_DEBUG
1171 if (!check_maxchar
1172 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1173 {
1174 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1175 Py_UCS4 ch;
1176 Py_ssize_t i;
1177 for (i=0; i < how_many; i++) {
1178 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1179 assert(ch <= to_maxchar);
1180 }
1181 }
1182#endif
1183
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001184 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001185 if (check_maxchar
1186 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1187 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001188 /* Writing Latin-1 characters into an ASCII string requires to
1189 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001190 Py_UCS4 max_char;
1191 max_char = ucs1lib_find_max_char(from_data,
1192 (Py_UCS1*)from_data + how_many);
1193 if (max_char >= 128)
1194 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001195 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001196 Py_MEMCPY((char*)to_data + to_kind * to_start,
1197 (char*)from_data + from_kind * from_start,
1198 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 else if (from_kind == PyUnicode_1BYTE_KIND
1201 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001202 {
1203 _PyUnicode_CONVERT_BYTES(
1204 Py_UCS1, Py_UCS2,
1205 PyUnicode_1BYTE_DATA(from) + from_start,
1206 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1207 PyUnicode_2BYTE_DATA(to) + to_start
1208 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001209 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001210 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001211 && to_kind == PyUnicode_4BYTE_KIND)
1212 {
1213 _PyUnicode_CONVERT_BYTES(
1214 Py_UCS1, Py_UCS4,
1215 PyUnicode_1BYTE_DATA(from) + from_start,
1216 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1217 PyUnicode_4BYTE_DATA(to) + to_start
1218 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001219 }
1220 else if (from_kind == PyUnicode_2BYTE_KIND
1221 && to_kind == PyUnicode_4BYTE_KIND)
1222 {
1223 _PyUnicode_CONVERT_BYTES(
1224 Py_UCS2, Py_UCS4,
1225 PyUnicode_2BYTE_DATA(from) + from_start,
1226 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1227 PyUnicode_4BYTE_DATA(to) + to_start
1228 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001229 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001230 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001231 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1232
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001233 if (!check_maxchar) {
1234 if (from_kind == PyUnicode_2BYTE_KIND
1235 && to_kind == PyUnicode_1BYTE_KIND)
1236 {
1237 _PyUnicode_CONVERT_BYTES(
1238 Py_UCS2, Py_UCS1,
1239 PyUnicode_2BYTE_DATA(from) + from_start,
1240 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1241 PyUnicode_1BYTE_DATA(to) + to_start
1242 );
1243 }
1244 else if (from_kind == PyUnicode_4BYTE_KIND
1245 && to_kind == PyUnicode_1BYTE_KIND)
1246 {
1247 _PyUnicode_CONVERT_BYTES(
1248 Py_UCS4, Py_UCS1,
1249 PyUnicode_4BYTE_DATA(from) + from_start,
1250 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1251 PyUnicode_1BYTE_DATA(to) + to_start
1252 );
1253 }
1254 else if (from_kind == PyUnicode_4BYTE_KIND
1255 && to_kind == PyUnicode_2BYTE_KIND)
1256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS4, Py_UCS2,
1259 PyUnicode_4BYTE_DATA(from) + from_start,
1260 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_2BYTE_DATA(to) + to_start
1262 );
1263 }
1264 else {
1265 assert(0);
1266 return -1;
1267 }
1268 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001269 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001270 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001271 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001272 Py_ssize_t i;
1273
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 for (i=0; i < how_many; i++) {
1275 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001276 if (ch > to_maxchar)
1277 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1279 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001280 }
1281 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001282 return 0;
1283}
1284
Victor Stinnerd3f08822012-05-29 12:57:52 +02001285void
1286_PyUnicode_FastCopyCharacters(
1287 PyObject *to, Py_ssize_t to_start,
1288 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001289{
1290 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1291}
1292
1293Py_ssize_t
1294PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1295 PyObject *from, Py_ssize_t from_start,
1296 Py_ssize_t how_many)
1297{
1298 int err;
1299
1300 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1301 PyErr_BadInternalCall();
1302 return -1;
1303 }
1304
Benjamin Petersonbac79492012-01-14 13:34:47 -05001305 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001306 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001307 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001308 return -1;
1309
Victor Stinnerd3f08822012-05-29 12:57:52 +02001310 if (from_start < 0) {
1311 PyErr_SetString(PyExc_IndexError, "string index out of range");
1312 return -1;
1313 }
1314 if (to_start < 0) {
1315 PyErr_SetString(PyExc_IndexError, "string index out of range");
1316 return -1;
1317 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001318 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1319 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1320 PyErr_Format(PyExc_SystemError,
1321 "Cannot write %zi characters at %zi "
1322 "in a string of %zi characters",
1323 how_many, to_start, PyUnicode_GET_LENGTH(to));
1324 return -1;
1325 }
1326
1327 if (how_many == 0)
1328 return 0;
1329
Victor Stinner488fa492011-12-12 00:01:39 +01001330 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001331 return -1;
1332
1333 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1334 if (err) {
1335 PyErr_Format(PyExc_SystemError,
1336 "Cannot copy %s characters "
1337 "into a string of %s characters",
1338 unicode_kind_name(from),
1339 unicode_kind_name(to));
1340 return -1;
1341 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001342 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343}
1344
Victor Stinner17222162011-09-28 22:15:37 +02001345/* Find the maximum code point and count the number of surrogate pairs so a
1346 correct string length can be computed before converting a string to UCS4.
1347 This function counts single surrogates as a character and not as a pair.
1348
1349 Return 0 on success, or -1 on error. */
1350static int
1351find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1352 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001353{
1354 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001355 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001356
Victor Stinnerc53be962011-10-02 21:33:54 +02001357 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 *num_surrogates = 0;
1359 *maxchar = 0;
1360
1361 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001363 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1364 && (iter+1) < end
1365 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001367 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001369 iter += 2;
1370 }
1371 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001373 {
1374 ch = *iter;
1375 iter++;
1376 }
1377 if (ch > *maxchar) {
1378 *maxchar = ch;
1379 if (*maxchar > MAX_UNICODE) {
1380 PyErr_Format(PyExc_ValueError,
1381 "character U+%x is not in range [U+0000; U+10ffff]",
1382 ch);
1383 return -1;
1384 }
1385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 }
1387 return 0;
1388}
1389
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001390int
1391_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392{
1393 wchar_t *end;
1394 Py_UCS4 maxchar = 0;
1395 Py_ssize_t num_surrogates;
1396#if SIZEOF_WCHAR_T == 2
1397 Py_ssize_t length_wo_surrogates;
1398#endif
1399
Georg Brandl7597add2011-10-05 16:36:47 +02001400 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001401 strings were created using _PyObject_New() and where no canonical
1402 representation (the str field) has been set yet aka strings
1403 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001404 assert(_PyUnicode_CHECK(unicode));
1405 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001406 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001407 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001408 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001409 /* Actually, it should neither be interned nor be anything else: */
1410 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001413 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001414 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416
1417 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001418 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1419 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 PyErr_NoMemory();
1421 return -1;
1422 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001423 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 _PyUnicode_WSTR(unicode), end,
1425 PyUnicode_1BYTE_DATA(unicode));
1426 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1427 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1428 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1429 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001430 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001431 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001432 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001435 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8(unicode) = NULL;
1437 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 PyObject_FREE(_PyUnicode_WSTR(unicode));
1440 _PyUnicode_WSTR(unicode) = NULL;
1441 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1442 }
1443 /* In this case we might have to convert down from 4-byte native
1444 wchar_t to 2-byte unicode. */
1445 else if (maxchar < 65536) {
1446 assert(num_surrogates == 0 &&
1447 "FindMaxCharAndNumSurrogatePairs() messed up");
1448
Victor Stinner506f5922011-09-28 22:34:18 +02001449#if SIZEOF_WCHAR_T == 2
1450 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001451 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001452 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1453 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1454 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001455 _PyUnicode_UTF8(unicode) = NULL;
1456 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001457#else
1458 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001460 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001462 PyErr_NoMemory();
1463 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 }
Victor Stinner506f5922011-09-28 22:34:18 +02001465 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1466 _PyUnicode_WSTR(unicode), end,
1467 PyUnicode_2BYTE_DATA(unicode));
1468 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1469 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1470 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001471 _PyUnicode_UTF8(unicode) = NULL;
1472 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001473 PyObject_FREE(_PyUnicode_WSTR(unicode));
1474 _PyUnicode_WSTR(unicode) = NULL;
1475 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1476#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 }
1478 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1479 else {
1480#if SIZEOF_WCHAR_T == 2
1481 /* in case the native representation is 2-bytes, we need to allocate a
1482 new normalized 4-byte version. */
1483 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001484 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1485 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001486 PyErr_NoMemory();
1487 return -1;
1488 }
1489 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1490 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001491 _PyUnicode_UTF8(unicode) = NULL;
1492 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001493 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1494 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001495 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 PyObject_FREE(_PyUnicode_WSTR(unicode));
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1499#else
1500 assert(num_surrogates == 0);
1501
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001504 _PyUnicode_UTF8(unicode) = NULL;
1505 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001506 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1507#endif
1508 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1509 }
1510 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001511 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001512 return 0;
1513}
1514
Alexander Belopolsky40018472011-02-26 01:02:56 +00001515static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001516unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517{
Walter Dörwald16807132007-05-25 13:52:07 +00001518 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001519 case SSTATE_NOT_INTERNED:
1520 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001521
Benjamin Peterson29060642009-01-31 22:14:21 +00001522 case SSTATE_INTERNED_MORTAL:
1523 /* revive dead object temporarily for DelItem */
1524 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001525 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 Py_FatalError(
1527 "deletion of interned string failed");
1528 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001529
Benjamin Peterson29060642009-01-31 22:14:21 +00001530 case SSTATE_INTERNED_IMMORTAL:
1531 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001532
Benjamin Peterson29060642009-01-31 22:14:21 +00001533 default:
1534 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001535 }
1536
Victor Stinner03490912011-10-03 23:45:12 +02001537 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001539 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001540 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001541 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1542 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001543
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001544 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545}
1546
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of the
   cached single-character latin1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length one can be a cached latin1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    return (first < 256 && unicode_latin1[first] == unicode) ? 1 : 0;
}
#endif
1563
Alexander Belopolsky40018472011-02-26 01:02:56 +00001564static int
Victor Stinner488fa492011-12-12 00:01:39 +01001565unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001566{
Victor Stinner488fa492011-12-12 00:01:39 +01001567 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001568 if (Py_REFCNT(unicode) != 1)
1569 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001570 if (_PyUnicode_HASH(unicode) != -1)
1571 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 if (PyUnicode_CHECK_INTERNED(unicode))
1573 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001574 if (!PyUnicode_CheckExact(unicode))
1575 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001576#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001577 /* singleton refcount is greater than 1 */
1578 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001579#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 return 1;
1581}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001582
Victor Stinnerfe226c02011-10-03 03:52:20 +02001583static int
1584unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1585{
1586 PyObject *unicode;
1587 Py_ssize_t old_length;
1588
1589 assert(p_unicode != NULL);
1590 unicode = *p_unicode;
1591
1592 assert(unicode != NULL);
1593 assert(PyUnicode_Check(unicode));
1594 assert(0 <= length);
1595
Victor Stinner910337b2011-10-03 03:20:16 +02001596 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001597 old_length = PyUnicode_WSTR_LENGTH(unicode);
1598 else
1599 old_length = PyUnicode_GET_LENGTH(unicode);
1600 if (old_length == length)
1601 return 0;
1602
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001603 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001604 _Py_INCREF_UNICODE_EMPTY();
1605 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001606 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001607 Py_DECREF(*p_unicode);
1608 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001609 return 0;
1610 }
1611
Victor Stinner488fa492011-12-12 00:01:39 +01001612 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001613 PyObject *copy = resize_copy(unicode, length);
1614 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001615 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001616 Py_DECREF(*p_unicode);
1617 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001618 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001619 }
1620
Victor Stinnerfe226c02011-10-03 03:52:20 +02001621 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001622 PyObject *new_unicode = resize_compact(unicode, length);
1623 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001625 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001626 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001627 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001628 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001629}
1630
Alexander Belopolsky40018472011-02-26 01:02:56 +00001631int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001632PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001633{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634 PyObject *unicode;
1635 if (p_unicode == NULL) {
1636 PyErr_BadInternalCall();
1637 return -1;
1638 }
1639 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001640 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 {
1642 PyErr_BadInternalCall();
1643 return -1;
1644 }
1645 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001646}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001647
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001648static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001649unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1650 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001651{
1652 PyObject *result;
1653 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001654 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1656 return 0;
1657 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1658 maxchar);
1659 if (result == NULL)
1660 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001661 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001662 Py_DECREF(*p_unicode);
1663 *p_unicode = result;
1664 return 0;
1665}
1666
1667static int
1668unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1669 Py_UCS4 ch)
1670{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001671 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001672 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001673 return -1;
1674 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1675 PyUnicode_DATA(*p_unicode),
1676 (*pos)++, ch);
1677 return 0;
1678}
1679
Victor Stinnerc5166102012-02-22 13:55:02 +01001680/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001681
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001682 WARNING: The function doesn't copy the terminating null character and
1683 doesn't check the maximum character (may write a latin1 character in an
1684 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001685static void
1686unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1687 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001688{
1689 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1690 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001691 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001692
1693 switch (kind) {
1694 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001695 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001696 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001697 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001698 }
1699 case PyUnicode_2BYTE_KIND: {
1700 Py_UCS2 *start = (Py_UCS2 *)data + index;
1701 Py_UCS2 *ucs2 = start;
1702 assert(index <= PyUnicode_GET_LENGTH(unicode));
1703
Victor Stinner184252a2012-06-16 02:57:41 +02001704 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001705 *ucs2 = (Py_UCS2)*str;
1706
1707 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001708 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001709 }
1710 default: {
1711 Py_UCS4 *start = (Py_UCS4 *)data + index;
1712 Py_UCS4 *ucs4 = start;
1713 assert(kind == PyUnicode_4BYTE_KIND);
1714 assert(index <= PyUnicode_GET_LENGTH(unicode));
1715
Victor Stinner184252a2012-06-16 02:57:41 +02001716 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 *ucs4 = (Py_UCS4)*str;
1718
1719 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001720 }
1721 }
1722}
1723
1724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001725static PyObject*
1726get_latin1_char(unsigned char ch)
1727{
Victor Stinnera464fc12011-10-02 20:39:30 +02001728 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001730 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 if (!unicode)
1732 return NULL;
1733 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001734 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 unicode_latin1[ch] = unicode;
1736 }
1737 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001738 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739}
1740
Alexander Belopolsky40018472011-02-26 01:02:56 +00001741PyObject *
1742PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001744 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 Py_UCS4 maxchar = 0;
1746 Py_ssize_t num_surrogates;
1747
1748 if (u == NULL)
1749 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001751 /* If the Unicode data is known at construction time, we can apply
1752 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001755 if (size == 0)
1756 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 /* Single character Unicode objects in the Latin-1 range are
1759 shared when using this constructor */
1760 if (size == 1 && *u < 256)
1761 return get_latin1_char((unsigned char)*u);
1762
1763 /* If not empty and not single character, copy the Unicode data
1764 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001765 if (find_maxchar_surrogates(u, u + size,
1766 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 return NULL;
1768
Victor Stinner8faf8212011-12-08 22:14:11 +01001769 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770 if (!unicode)
1771 return NULL;
1772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773 switch (PyUnicode_KIND(unicode)) {
1774 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001775 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1777 break;
1778 case PyUnicode_2BYTE_KIND:
1779#if Py_UNICODE_SIZE == 2
1780 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1781#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001782 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1784#endif
1785 break;
1786 case PyUnicode_4BYTE_KIND:
1787#if SIZEOF_WCHAR_T == 2
1788 /* This is the only case which has to process surrogates, thus
1789 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001790 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791#else
1792 assert(num_surrogates == 0);
1793 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1794#endif
1795 break;
1796 default:
1797 assert(0 && "Impossible state");
1798 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001800 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801}
1802
Alexander Belopolsky40018472011-02-26 01:02:56 +00001803PyObject *
1804PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001805{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001806 if (size < 0) {
1807 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001808 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001809 return NULL;
1810 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001811 if (u != NULL)
1812 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1813 else
1814 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001815}
1816
Alexander Belopolsky40018472011-02-26 01:02:56 +00001817PyObject *
1818PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001819{
1820 size_t size = strlen(u);
1821 if (size > PY_SSIZE_T_MAX) {
1822 PyErr_SetString(PyExc_OverflowError, "input too long");
1823 return NULL;
1824 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001825 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001826}
1827
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001828PyObject *
1829_PyUnicode_FromId(_Py_Identifier *id)
1830{
1831 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001832 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1833 strlen(id->string),
1834 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001835 if (!id->object)
1836 return NULL;
1837 PyUnicode_InternInPlace(&id->object);
1838 assert(!id->next);
1839 id->next = static_strings;
1840 static_strings = id;
1841 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001842 return id->object;
1843}
1844
1845void
1846_PyUnicode_ClearStaticStrings()
1847{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001848 _Py_Identifier *tmp, *s = static_strings;
1849 while (s) {
1850 Py_DECREF(s->object);
1851 s->object = NULL;
1852 tmp = s->next;
1853 s->next = NULL;
1854 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001855 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001856 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001857}
1858
Benjamin Peterson0df54292012-03-26 14:50:32 -04001859/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001860
Victor Stinnerd3f08822012-05-29 12:57:52 +02001861PyObject*
1862_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001863{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001864 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001865 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001866 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001867#ifdef Py_DEBUG
Victor Stinnere6b2d442011-12-11 21:54:30 +01001868 assert(s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001869#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001870 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001871 }
Victor Stinner785938e2011-12-11 20:09:03 +01001872 unicode = PyUnicode_New(size, 127);
1873 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001874 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001875 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1876 assert(_PyUnicode_CheckConsistency(unicode, 1));
1877 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001878}
1879
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001880static Py_UCS4
1881kind_maxchar_limit(unsigned int kind)
1882{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001883 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001884 case PyUnicode_1BYTE_KIND:
1885 return 0x80;
1886 case PyUnicode_2BYTE_KIND:
1887 return 0x100;
1888 case PyUnicode_4BYTE_KIND:
1889 return 0x10000;
1890 default:
1891 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001892 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001893 }
1894}
1895
Victor Stinnere6abb482012-05-02 01:15:40 +02001896Py_LOCAL_INLINE(Py_UCS4)
1897align_maxchar(Py_UCS4 maxchar)
1898{
1899 if (maxchar <= 127)
1900 return 127;
1901 else if (maxchar <= 255)
1902 return 255;
1903 else if (maxchar <= 65535)
1904 return 65535;
1905 else
1906 return MAX_UNICODE;
1907}
1908
Victor Stinner702c7342011-10-05 13:50:52 +02001909static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001910_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001911{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001913 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001914
Serhiy Storchaka678db842013-01-26 12:16:36 +02001915 if (size == 0)
1916 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001918 if (size == 1)
1919 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001920
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001921 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001922 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 if (!res)
1924 return NULL;
1925 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001928}
1929
Victor Stinnere57b1c02011-09-28 22:20:48 +02001930static PyObject*
1931_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001932{
1933 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001934 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001935
Serhiy Storchaka678db842013-01-26 12:16:36 +02001936 if (size == 0)
1937 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001938 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001939 if (size == 1) {
1940 Py_UCS4 ch = u[0];
1941 if (ch < 256)
1942 return get_latin1_char((unsigned char)ch);
1943
1944 res = PyUnicode_New(1, ch);
1945 if (res == NULL)
1946 return NULL;
1947 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1948 assert(_PyUnicode_CheckConsistency(res, 1));
1949 return res;
1950 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001951
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001952 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001953 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001954 if (!res)
1955 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001956 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001958 else {
1959 _PyUnicode_CONVERT_BYTES(
1960 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1961 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001962 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001963 return res;
1964}
1965
Victor Stinnere57b1c02011-09-28 22:20:48 +02001966static PyObject*
1967_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968{
1969 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001970 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001971
Serhiy Storchaka678db842013-01-26 12:16:36 +02001972 if (size == 0)
1973 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001975 if (size == 1) {
1976 Py_UCS4 ch = u[0];
1977 if (ch < 256)
1978 return get_latin1_char((unsigned char)ch);
1979
1980 res = PyUnicode_New(1, ch);
1981 if (res == NULL)
1982 return NULL;
1983 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1984 assert(_PyUnicode_CheckConsistency(res, 1));
1985 return res;
1986 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001987
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001988 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001989 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 if (!res)
1991 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001992 if (max_char < 256)
1993 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1994 PyUnicode_1BYTE_DATA(res));
1995 else if (max_char < 0x10000)
1996 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1997 PyUnicode_2BYTE_DATA(res));
1998 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001999 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002000 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002001 return res;
2002}
2003
2004PyObject*
2005PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2006{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002007 if (size < 0) {
2008 PyErr_SetString(PyExc_ValueError, "size must be positive");
2009 return NULL;
2010 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002011 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002012 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002013 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002015 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002017 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002018 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019 PyErr_SetString(PyExc_SystemError, "invalid kind");
2020 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002021 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022}
2023
Victor Stinnerece58de2012-04-23 23:36:38 +02002024Py_UCS4
2025_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2026{
2027 enum PyUnicode_Kind kind;
2028 void *startptr, *endptr;
2029
2030 assert(PyUnicode_IS_READY(unicode));
2031 assert(0 <= start);
2032 assert(end <= PyUnicode_GET_LENGTH(unicode));
2033 assert(start <= end);
2034
2035 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2036 return PyUnicode_MAX_CHAR_VALUE(unicode);
2037
2038 if (start == end)
2039 return 127;
2040
Victor Stinner94d558b2012-04-27 22:26:58 +02002041 if (PyUnicode_IS_ASCII(unicode))
2042 return 127;
2043
Victor Stinnerece58de2012-04-23 23:36:38 +02002044 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002045 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002046 endptr = (char *)startptr + end * kind;
2047 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002048 switch(kind) {
2049 case PyUnicode_1BYTE_KIND:
2050 return ucs1lib_find_max_char(startptr, endptr);
2051 case PyUnicode_2BYTE_KIND:
2052 return ucs2lib_find_max_char(startptr, endptr);
2053 case PyUnicode_4BYTE_KIND:
2054 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002055 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002056 assert(0);
2057 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002058 }
2059}
2060
Victor Stinner25a4b292011-10-06 12:31:55 +02002061/* Ensure that a string uses the most efficient storage, if it is not the
2062 case: create a new string with of the right kind. Write NULL into *p_unicode
2063 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002064static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002065unicode_adjust_maxchar(PyObject **p_unicode)
2066{
2067 PyObject *unicode, *copy;
2068 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002069 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002070 unsigned int kind;
2071
2072 assert(p_unicode != NULL);
2073 unicode = *p_unicode;
2074 assert(PyUnicode_IS_READY(unicode));
2075 if (PyUnicode_IS_ASCII(unicode))
2076 return;
2077
2078 len = PyUnicode_GET_LENGTH(unicode);
2079 kind = PyUnicode_KIND(unicode);
2080 if (kind == PyUnicode_1BYTE_KIND) {
2081 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002082 max_char = ucs1lib_find_max_char(u, u + len);
2083 if (max_char >= 128)
2084 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002085 }
2086 else if (kind == PyUnicode_2BYTE_KIND) {
2087 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002088 max_char = ucs2lib_find_max_char(u, u + len);
2089 if (max_char >= 256)
2090 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002091 }
2092 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002093 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002094 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002095 max_char = ucs4lib_find_max_char(u, u + len);
2096 if (max_char >= 0x10000)
2097 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002099 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002100 if (copy != NULL)
2101 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002102 Py_DECREF(unicode);
2103 *p_unicode = copy;
2104}
2105
Victor Stinner034f6cf2011-09-30 02:26:44 +02002106PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002107_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002108{
Victor Stinner87af4f22011-11-21 23:03:47 +01002109 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002110 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002111
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112 if (!PyUnicode_Check(unicode)) {
2113 PyErr_BadInternalCall();
2114 return NULL;
2115 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002116 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002117 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002118
Victor Stinner87af4f22011-11-21 23:03:47 +01002119 length = PyUnicode_GET_LENGTH(unicode);
2120 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002121 if (!copy)
2122 return NULL;
2123 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2124
Victor Stinner87af4f22011-11-21 23:03:47 +01002125 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2126 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002127 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002128 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002129}
2130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131
Victor Stinnerbc603d12011-10-02 01:00:40 +02002132/* Widen Unicode objects to larger buffers. Don't write terminating null
2133 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134
2135void*
2136_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2137{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002138 Py_ssize_t len;
2139 void *result;
2140 unsigned int skind;
2141
Benjamin Petersonbac79492012-01-14 13:34:47 -05002142 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002143 return NULL;
2144
2145 len = PyUnicode_GET_LENGTH(s);
2146 skind = PyUnicode_KIND(s);
2147 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002148 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002149 return NULL;
2150 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002151 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002152 case PyUnicode_2BYTE_KIND:
2153 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2154 if (!result)
2155 return PyErr_NoMemory();
2156 assert(skind == PyUnicode_1BYTE_KIND);
2157 _PyUnicode_CONVERT_BYTES(
2158 Py_UCS1, Py_UCS2,
2159 PyUnicode_1BYTE_DATA(s),
2160 PyUnicode_1BYTE_DATA(s) + len,
2161 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002163 case PyUnicode_4BYTE_KIND:
2164 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2165 if (!result)
2166 return PyErr_NoMemory();
2167 if (skind == PyUnicode_2BYTE_KIND) {
2168 _PyUnicode_CONVERT_BYTES(
2169 Py_UCS2, Py_UCS4,
2170 PyUnicode_2BYTE_DATA(s),
2171 PyUnicode_2BYTE_DATA(s) + len,
2172 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 else {
2175 assert(skind == PyUnicode_1BYTE_KIND);
2176 _PyUnicode_CONVERT_BYTES(
2177 Py_UCS1, Py_UCS4,
2178 PyUnicode_1BYTE_DATA(s),
2179 PyUnicode_1BYTE_DATA(s) + len,
2180 result);
2181 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002183 default:
2184 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 }
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188}
2189
2190static Py_UCS4*
2191as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2192 int copy_null)
2193{
2194 int kind;
2195 void *data;
2196 Py_ssize_t len, targetlen;
2197 if (PyUnicode_READY(string) == -1)
2198 return NULL;
2199 kind = PyUnicode_KIND(string);
2200 data = PyUnicode_DATA(string);
2201 len = PyUnicode_GET_LENGTH(string);
2202 targetlen = len;
2203 if (copy_null)
2204 targetlen++;
2205 if (!target) {
2206 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2207 PyErr_NoMemory();
2208 return NULL;
2209 }
2210 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2211 if (!target) {
2212 PyErr_NoMemory();
2213 return NULL;
2214 }
2215 }
2216 else {
2217 if (targetsize < targetlen) {
2218 PyErr_Format(PyExc_SystemError,
2219 "string is longer than the buffer");
2220 if (copy_null && 0 < targetsize)
2221 target[0] = 0;
2222 return NULL;
2223 }
2224 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002225 if (kind == PyUnicode_1BYTE_KIND) {
2226 Py_UCS1 *start = (Py_UCS1 *) data;
2227 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002229 else if (kind == PyUnicode_2BYTE_KIND) {
2230 Py_UCS2 *start = (Py_UCS2 *) data;
2231 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2232 }
2233 else {
2234 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 if (copy_null)
2238 target[len] = 0;
2239 return target;
2240}
2241
2242Py_UCS4*
2243PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2244 int copy_null)
2245{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002246 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 PyErr_BadInternalCall();
2248 return NULL;
2249 }
2250 return as_ucs4(string, target, targetsize, copy_null);
2251}
2252
2253Py_UCS4*
2254PyUnicode_AsUCS4Copy(PyObject *string)
2255{
2256 return as_ucs4(string, NULL, 0, 1);
2257}
2258
2259#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002260
Alexander Belopolsky40018472011-02-26 01:02:56 +00002261PyObject *
2262PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002264 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002266 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002267 PyErr_BadInternalCall();
2268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 }
2270
Martin v. Löwis790465f2008-04-05 20:41:37 +00002271 if (size == -1) {
2272 size = wcslen(w);
2273 }
2274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276}
2277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002279
Walter Dörwald346737f2007-05-31 10:44:43 +00002280static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002281makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2282 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002283{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002284 *fmt++ = '%';
2285 if (width) {
2286 if (zeropad)
2287 *fmt++ = '0';
2288 fmt += sprintf(fmt, "%d", width);
2289 }
2290 if (precision)
2291 fmt += sprintf(fmt, ".%d", precision);
2292 if (longflag)
2293 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002294 else if (longlongflag) {
2295 /* longlongflag should only ever be nonzero on machines with
2296 HAVE_LONG_LONG defined */
2297#ifdef HAVE_LONG_LONG
2298 char *f = PY_FORMAT_LONG_LONG;
2299 while (*f)
2300 *fmt++ = *f++;
2301#else
2302 /* we shouldn't ever get here */
2303 assert(0);
2304 *fmt++ = 'l';
2305#endif
2306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002307 else if (size_tflag) {
2308 char *f = PY_FORMAT_SIZE_T;
2309 while (*f)
2310 *fmt++ = *f++;
2311 }
2312 *fmt++ = c;
2313 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002314}
2315
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that sits between '%' and the conversion character.

   `f` must point at the '%'.  The function reads an optional decimal width,
   an optional '.' followed by a decimal precision, and an optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z").  A modifier
   is only recognised when it is directly followed by 'd', 'u' or 'i'.

   Each of the five output pointers may be NULL, in which case that value is
   not stored.  Missing width/precision are reported as 0, flags as 0 or 1.

   Returns a pointer to the conversion character.  Two malformed inputs make
   the function step back one character before returning: a precision
   directly followed by '%', and a specification cut off by the end of the
   string. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;                                /* skip the '%' */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';

    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli, %zd, %zu, %zi.
       f[2] is only inspected after f[1] has been seen to be 'l', so the
       scan never looks beyond the terminating null character. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* size_t flag */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2380
/* Upper bound on the number of characters produced by %ld: 21 characters
   are enough for a 64-bit integer written in decimal plus an optional
   sign. */
#define MAX_LONG_CHARS 21

/* Upper bound on the number of characters produced by %lld.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus one
   character for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
2388
Walter Dörwaldd2034312007-05-18 16:29:38 +00002389PyObject *
2390PyUnicode_FromFormatV(const char *format, va_list vargs)
2391{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002392 va_list count;
2393 Py_ssize_t callcount = 0;
2394 PyObject **callresults = NULL;
2395 PyObject **callresult = NULL;
2396 Py_ssize_t n = 0;
2397 int width = 0;
2398 int precision = 0;
2399 int zeropad;
2400 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002401 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002402 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002403 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002404 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2405 Py_UCS4 argmaxchar;
2406 Py_ssize_t numbersize = 0;
2407 char *numberresults = NULL;
2408 char *numberresult = NULL;
2409 Py_ssize_t i;
2410 int kind;
2411 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002412
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002413 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002414 /* step 1: count the number of %S/%R/%A/%s format specifications
2415 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2416 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002417 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002418 * also estimate a upper bound for all the number formats in the string,
2419 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002420 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002421 for (f = format; *f; f++) {
2422 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002423 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2425 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2426 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2427 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002430#ifdef HAVE_LONG_LONG
2431 if (longlongflag) {
2432 if (width < MAX_LONG_LONG_CHARS)
2433 width = MAX_LONG_LONG_CHARS;
2434 }
2435 else
2436#endif
2437 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2438 including sign. Decimal takes the most space. This
2439 isn't enough for octal. If a width is specified we
2440 need more (which we allocate later). */
2441 if (width < MAX_LONG_CHARS)
2442 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443
2444 /* account for the size + '\0' to separate numbers
2445 inside of the numberresults buffer */
2446 numbersize += (width + 1);
2447 }
2448 }
2449 else if ((unsigned char)*f > 127) {
2450 PyErr_Format(PyExc_ValueError,
2451 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2452 "string, got a non-ASCII byte: 0x%02x",
2453 (unsigned char)*f);
2454 return NULL;
2455 }
2456 }
2457 /* step 2: allocate memory for the results of
2458 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2459 if (callcount) {
2460 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2461 if (!callresults) {
2462 PyErr_NoMemory();
2463 return NULL;
2464 }
2465 callresult = callresults;
2466 }
2467 /* step 2.5: allocate memory for the results of formating numbers */
2468 if (numbersize) {
2469 numberresults = PyObject_Malloc(numbersize);
2470 if (!numberresults) {
2471 PyErr_NoMemory();
2472 goto fail;
2473 }
2474 numberresult = numberresults;
2475 }
2476
2477 /* step 3: format numbers and figure out how large a buffer we need */
2478 for (f = format; *f; f++) {
2479 if (*f == '%') {
2480 const char* p;
2481 int longflag;
2482 int longlongflag;
2483 int size_tflag;
2484 int numprinted;
2485
2486 p = f;
2487 zeropad = (f[1] == '0');
2488 f = parse_format_flags(f, &width, &precision,
2489 &longflag, &longlongflag, &size_tflag);
2490 switch (*f) {
2491 case 'c':
2492 {
2493 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002494 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 n++;
2496 break;
2497 }
2498 case '%':
2499 n++;
2500 break;
2501 case 'i':
2502 case 'd':
2503 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2504 width, precision, *f);
2505 if (longflag)
2506 numprinted = sprintf(numberresult, fmt,
2507 va_arg(count, long));
2508#ifdef HAVE_LONG_LONG
2509 else if (longlongflag)
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, PY_LONG_LONG));
2512#endif
2513 else if (size_tflag)
2514 numprinted = sprintf(numberresult, fmt,
2515 va_arg(count, Py_ssize_t));
2516 else
2517 numprinted = sprintf(numberresult, fmt,
2518 va_arg(count, int));
2519 n += numprinted;
2520 /* advance by +1 to skip over the '\0' */
2521 numberresult += (numprinted + 1);
2522 assert(*(numberresult - 1) == '\0');
2523 assert(*(numberresult - 2) != '\0');
2524 assert(numprinted >= 0);
2525 assert(numberresult <= numberresults + numbersize);
2526 break;
2527 case 'u':
2528 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2529 width, precision, 'u');
2530 if (longflag)
2531 numprinted = sprintf(numberresult, fmt,
2532 va_arg(count, unsigned long));
2533#ifdef HAVE_LONG_LONG
2534 else if (longlongflag)
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned PY_LONG_LONG));
2537#endif
2538 else if (size_tflag)
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, size_t));
2541 else
2542 numprinted = sprintf(numberresult, fmt,
2543 va_arg(count, unsigned int));
2544 n += numprinted;
2545 numberresult += (numprinted + 1);
2546 assert(*(numberresult - 1) == '\0');
2547 assert(*(numberresult - 2) != '\0');
2548 assert(numprinted >= 0);
2549 assert(numberresult <= numberresults + numbersize);
2550 break;
2551 case 'x':
2552 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2553 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2554 n += numprinted;
2555 numberresult += (numprinted + 1);
2556 assert(*(numberresult - 1) == '\0');
2557 assert(*(numberresult - 2) != '\0');
2558 assert(numprinted >= 0);
2559 assert(numberresult <= numberresults + numbersize);
2560 break;
2561 case 'p':
2562 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2563 /* %p is ill-defined: ensure leading 0x. */
2564 if (numberresult[1] == 'X')
2565 numberresult[1] = 'x';
2566 else if (numberresult[1] != 'x') {
2567 memmove(numberresult + 2, numberresult,
2568 strlen(numberresult) + 1);
2569 numberresult[0] = '0';
2570 numberresult[1] = 'x';
2571 numprinted += 2;
2572 }
2573 n += numprinted;
2574 numberresult += (numprinted + 1);
2575 assert(*(numberresult - 1) == '\0');
2576 assert(*(numberresult - 2) != '\0');
2577 assert(numprinted >= 0);
2578 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 break;
2580 case 's':
2581 {
2582 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002583 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002584 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002585 if (!str)
2586 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002587 /* since PyUnicode_DecodeUTF8 returns already flexible
2588 unicode objects, there is no need to call ready on them */
2589 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002590 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002592 /* Remember the str and switch to the next slot */
2593 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002594 break;
2595 }
2596 case 'U':
2597 {
2598 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002599 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 if (PyUnicode_READY(obj) == -1)
2601 goto fail;
2602 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002603 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 }
2607 case 'V':
2608 {
2609 PyObject *obj = va_arg(count, PyObject *);
2610 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002611 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002612 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002613 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002614 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002615 if (PyUnicode_READY(obj) == -1)
2616 goto fail;
2617 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002618 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002620 *callresult++ = NULL;
2621 }
2622 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002623 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002624 if (!str_obj)
2625 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002626 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002627 Py_DECREF(str_obj);
2628 goto fail;
2629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002631 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002632 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002633 *callresult++ = str_obj;
2634 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 break;
2636 }
2637 case 'S':
2638 {
2639 PyObject *obj = va_arg(count, PyObject *);
2640 PyObject *str;
2641 assert(obj);
2642 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002643 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002645 if (PyUnicode_READY(str) == -1) {
2646 Py_DECREF(str);
2647 goto fail;
2648 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002650 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002652 /* Remember the str and switch to the next slot */
2653 *callresult++ = str;
2654 break;
2655 }
2656 case 'R':
2657 {
2658 PyObject *obj = va_arg(count, PyObject *);
2659 PyObject *repr;
2660 assert(obj);
2661 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002662 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002663 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002664 if (PyUnicode_READY(repr) == -1) {
2665 Py_DECREF(repr);
2666 goto fail;
2667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002669 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002671 /* Remember the repr and switch to the next slot */
2672 *callresult++ = repr;
2673 break;
2674 }
2675 case 'A':
2676 {
2677 PyObject *obj = va_arg(count, PyObject *);
2678 PyObject *ascii;
2679 assert(obj);
2680 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002681 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002683 if (PyUnicode_READY(ascii) == -1) {
2684 Py_DECREF(ascii);
2685 goto fail;
2686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002688 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002689 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002690 /* Remember the repr and switch to the next slot */
2691 *callresult++ = ascii;
2692 break;
2693 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 default:
2695 /* if we stumble upon an unknown
2696 formatting code, copy the rest of
2697 the format string to the output
2698 string. (we cannot just skip the
2699 code, since there's no way to know
2700 what's in the argument list) */
2701 n += strlen(p);
2702 goto expand;
2703 }
2704 } else
2705 n++;
2706 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002707 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002709 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002710 we don't have to resize the string.
2711 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002712 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002713 if (!string)
2714 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 kind = PyUnicode_KIND(string);
2716 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002722 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002723
2724 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002725 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2726 /* checking for == because the last argument could be a empty
2727 string, which causes i to point to end, the assert at the end of
2728 the loop */
2729 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002730
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 switch (*f) {
2732 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002733 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 const int ordinal = va_arg(vargs, int);
2735 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002736 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002737 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002738 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002741 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002743 {
Victor Stinner184252a2012-06-16 02:57:41 +02002744 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002745 /* unused, since we already have the result */
2746 if (*f == 'p')
2747 (void) va_arg(vargs, void *);
2748 else
2749 (void) va_arg(vargs, int);
2750 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002751 len = strlen(numberresult);
2752 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002754 i += len;
2755 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 assert(*numberresult == '\0');
2757 numberresult++;
2758 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002759 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002760 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002761 case 's':
2762 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002763 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002764 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002765 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002766 size = PyUnicode_GET_LENGTH(*callresult);
2767 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002768 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002769 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002770 /* We're done with the unicode()/repr() => forget it */
2771 Py_DECREF(*callresult);
2772 /* switch to next unicode()/repr() result */
2773 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002774 break;
2775 }
2776 case 'U':
2777 {
2778 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 Py_ssize_t size;
2780 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2781 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002782 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002784 break;
2785 }
2786 case 'V':
2787 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002789 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002790 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002791 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 size = PyUnicode_GET_LENGTH(obj);
2793 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002794 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002795 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002796 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002797 size = PyUnicode_GET_LENGTH(*callresult);
2798 assert(PyUnicode_KIND(*callresult) <=
2799 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002800 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002802 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002803 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002804 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002805 break;
2806 }
2807 case 'S':
2808 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002809 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002810 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002811 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 /* unused, since we already have the result */
2813 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002814 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002815 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002816 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002817 /* We're done with the unicode()/repr() => forget it */
2818 Py_DECREF(*callresult);
2819 /* switch to next unicode()/repr() result */
2820 ++callresult;
2821 break;
2822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002823 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002824 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002825 break;
2826 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002827 {
2828 Py_ssize_t len = strlen(p);
2829 unicode_write_cstr(string, i, p, len);
2830 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002831 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002832 goto end;
2833 }
Victor Stinner184252a2012-06-16 02:57:41 +02002834 }
Victor Stinner1205f272010-09-11 00:54:47 +00002835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836 else {
2837 assert(i < PyUnicode_GET_LENGTH(string));
2838 PyUnicode_WRITE(kind, data, i++, *f);
2839 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002841 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002842
Benjamin Peterson29060642009-01-31 22:14:21 +00002843 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 if (callresults)
2845 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002846 if (numberresults)
2847 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002848 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002849 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002850 if (callresults) {
2851 PyObject **callresult2 = callresults;
2852 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002853 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 ++callresult2;
2855 }
2856 PyObject_Free(callresults);
2857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002858 if (numberresults)
2859 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002860 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002861}
2862
Walter Dörwaldd2034312007-05-18 16:29:38 +00002863PyObject *
2864PyUnicode_FromFormat(const char *format, ...)
2865{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002866 PyObject* ret;
2867 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002868
2869#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002871#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002872 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002873#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 ret = PyUnicode_FromFormatV(format, vargs);
2875 va_end(vargs);
2876 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877}
2878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002879#ifdef HAVE_WCHAR_H
2880
Victor Stinner5593d8a2010-10-02 11:11:27 +00002881/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2882 convert a Unicode object to a wide character string.
2883
Victor Stinnerd88d9832011-09-06 02:00:05 +02002884 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885 character) required to convert the unicode object. Ignore size argument.
2886
Victor Stinnerd88d9832011-09-06 02:00:05 +02002887 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002888 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002889 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002890static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002891unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002892 wchar_t *w,
2893 Py_ssize_t size)
2894{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002895 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002896 const wchar_t *wstr;
2897
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002898 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002899 if (wstr == NULL)
2900 return -1;
2901
Victor Stinner5593d8a2010-10-02 11:11:27 +00002902 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002903 if (size > res)
2904 size = res + 1;
2905 else
2906 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002908 return res;
2909 }
2910 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002912}
2913
2914Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002915PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002916 wchar_t *w,
2917 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918{
2919 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002920 PyErr_BadInternalCall();
2921 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002923 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924}
2925
Victor Stinner137c34c2010-09-29 10:25:54 +00002926wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002927PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002928 Py_ssize_t *size)
2929{
2930 wchar_t* buffer;
2931 Py_ssize_t buflen;
2932
2933 if (unicode == NULL) {
2934 PyErr_BadInternalCall();
2935 return NULL;
2936 }
2937
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002938 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002939 if (buflen == -1)
2940 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002941 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002942 PyErr_NoMemory();
2943 return NULL;
2944 }
2945
Victor Stinner137c34c2010-09-29 10:25:54 +00002946 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2947 if (buffer == NULL) {
2948 PyErr_NoMemory();
2949 return NULL;
2950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002951 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002952 if (buflen == -1) {
2953 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002954 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002955 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002956 if (size != NULL)
2957 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002958 return buffer;
2959}
2960
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002962
Alexander Belopolsky40018472011-02-26 01:02:56 +00002963PyObject *
2964PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002966 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002967 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 PyErr_SetString(PyExc_ValueError,
2969 "chr() arg not in range(0x110000)");
2970 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002971 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002973 if (ordinal < 256)
2974 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002976 v = PyUnicode_New(1, ordinal);
2977 if (v == NULL)
2978 return NULL;
2979 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002980 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002981 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002982}
2983
Alexander Belopolsky40018472011-02-26 01:02:56 +00002984PyObject *
2985PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002987 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002989 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002990 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002991 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 Py_INCREF(obj);
2993 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002994 }
2995 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 /* For a Unicode subtype that's not a Unicode object,
2997 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002998 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002999 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003000 PyErr_Format(PyExc_TypeError,
3001 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003002 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003003 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003004}
3005
Alexander Belopolsky40018472011-02-26 01:02:56 +00003006PyObject *
3007PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003008 const char *encoding,
3009 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003010{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003011 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003012 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003013
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 PyErr_BadInternalCall();
3016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003017 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003018
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003019 /* Decoding bytes objects is the most common case and should be fast */
3020 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003021 if (PyBytes_GET_SIZE(obj) == 0)
3022 _Py_RETURN_UNICODE_EMPTY();
3023 v = PyUnicode_Decode(
3024 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3025 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003026 return v;
3027 }
3028
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003029 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 PyErr_SetString(PyExc_TypeError,
3031 "decoding str is not supported");
3032 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003033 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003034
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003035 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3036 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3037 PyErr_Format(PyExc_TypeError,
3038 "coercing to str: need bytes, bytearray "
3039 "or buffer-like object, %.80s found",
3040 Py_TYPE(obj)->tp_name);
3041 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003042 }
Tim Petersced69f82003-09-16 20:30:58 +00003043
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003044 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003045 PyBuffer_Release(&buffer);
3046 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003047 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003048
Serhiy Storchaka05997252013-01-26 12:14:02 +02003049 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003050 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003051 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052}
3053
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, storing the null-terminated result in lower.

   encoding:  name to normalize; NULL is treated as "utf-8".
   lower:     output buffer.
   lower_len: capacity of lower in bytes, including the terminating null.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null, does not fit in lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator, and computing
       &lower[lower_len - 1] below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof("utf-8") counts the terminating null; never overrun
           a buffer that is too small for the default name. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091PyObject *
3092PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003093 Py_ssize_t size,
3094 const char *encoding,
3095 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003096{
3097 PyObject *buffer = NULL, *unicode;
3098 Py_buffer info;
3099 char lower[11]; /* Enough for any encoding shortcut */
3100
Fred Drakee4315f52000-05-09 19:53:39 +00003101 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003102 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003103 if ((strcmp(lower, "utf-8") == 0) ||
3104 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003105 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003106 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003107 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003108 (strcmp(lower, "iso-8859-1") == 0))
3109 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003110#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003111 else if (strcmp(lower, "mbcs") == 0)
3112 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003113#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003114 else if (strcmp(lower, "ascii") == 0)
3115 return PyUnicode_DecodeASCII(s, size, errors);
3116 else if (strcmp(lower, "utf-16") == 0)
3117 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3118 else if (strcmp(lower, "utf-32") == 0)
3119 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121
3122 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003123 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003124 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003125 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003126 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003127 if (buffer == NULL)
3128 goto onError;
3129 unicode = PyCodec_Decode(buffer, encoding, errors);
3130 if (unicode == NULL)
3131 goto onError;
3132 if (!PyUnicode_Check(unicode)) {
3133 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003134 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003135 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 Py_DECREF(unicode);
3137 goto onError;
3138 }
3139 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003140 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003141
Benjamin Peterson29060642009-01-31 22:14:21 +00003142 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 Py_XDECREF(buffer);
3144 return NULL;
3145}
3146
Alexander Belopolsky40018472011-02-26 01:02:56 +00003147PyObject *
3148PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003149 const char *encoding,
3150 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003151{
3152 PyObject *v;
3153
3154 if (!PyUnicode_Check(unicode)) {
3155 PyErr_BadArgument();
3156 goto onError;
3157 }
3158
3159 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003160 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003161
3162 /* Decode via the codec registry */
3163 v = PyCodec_Decode(unicode, encoding, errors);
3164 if (v == NULL)
3165 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003166 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003167
Benjamin Peterson29060642009-01-31 22:14:21 +00003168 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003169 return NULL;
3170}
3171
Alexander Belopolsky40018472011-02-26 01:02:56 +00003172PyObject *
3173PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003174 const char *encoding,
3175 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003176{
3177 PyObject *v;
3178
3179 if (!PyUnicode_Check(unicode)) {
3180 PyErr_BadArgument();
3181 goto onError;
3182 }
3183
3184 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003186
3187 /* Decode via the codec registry */
3188 v = PyCodec_Decode(unicode, encoding, errors);
3189 if (v == NULL)
3190 goto onError;
3191 if (!PyUnicode_Check(v)) {
3192 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003193 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003194 Py_TYPE(v)->tp_name);
3195 Py_DECREF(v);
3196 goto onError;
3197 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003198 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003199
Benjamin Peterson29060642009-01-31 22:14:21 +00003200 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003201 return NULL;
3202}
3203
Alexander Belopolsky40018472011-02-26 01:02:56 +00003204PyObject *
3205PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003206 Py_ssize_t size,
3207 const char *encoding,
3208 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209{
3210 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003211
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212 unicode = PyUnicode_FromUnicode(s, size);
3213 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3216 Py_DECREF(unicode);
3217 return v;
3218}
3219
Alexander Belopolsky40018472011-02-26 01:02:56 +00003220PyObject *
3221PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003222 const char *encoding,
3223 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003224{
3225 PyObject *v;
3226
3227 if (!PyUnicode_Check(unicode)) {
3228 PyErr_BadArgument();
3229 goto onError;
3230 }
3231
3232 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003233 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003234
3235 /* Encode via the codec registry */
3236 v = PyCodec_Encode(unicode, encoding, errors);
3237 if (v == NULL)
3238 goto onError;
3239 return v;
3240
Benjamin Peterson29060642009-01-31 22:14:21 +00003241 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003242 return NULL;
3243}
3244
/* Find the first unit of wstr that wcstombs() refuses to convert.
   The string is fed to wcstombs() one unit at a time; when wchar_t is
   16 bits wide a high/low surrogate pair is converted as a single unit.
   Returns the index in wstr where the failing unit starts, or 0 when
   every unit converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot permanently holds the terminator of the unit string. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        unit[0] = *cursor++;
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
        {
            /* Keep the surrogate pair together. */
            unit[1] = *cursor++;
        }
        else {
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3291
Victor Stinner1b579672011-12-17 05:47:23 +01003292static int
3293locale_error_handler(const char *errors, int *surrogateescape)
3294{
3295 if (errors == NULL) {
3296 *surrogateescape = 0;
3297 return 0;
3298 }
3299
3300 if (strcmp(errors, "strict") == 0) {
3301 *surrogateescape = 0;
3302 return 0;
3303 }
3304 if (strcmp(errors, "surrogateescape") == 0) {
3305 *surrogateescape = 1;
3306 return 0;
3307 }
3308 PyErr_Format(PyExc_ValueError,
3309 "only 'strict' and 'surrogateescape' error handlers "
3310 "are supported, not '%s'",
3311 errors);
3312 return -1;
3313}
3314
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003315PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003316PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003317{
3318 Py_ssize_t wlen, wlen2;
3319 wchar_t *wstr;
3320 PyObject *bytes = NULL;
3321 char *errmsg;
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003322 PyObject *reason;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003323 PyObject *exc;
3324 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003325 int surrogateescape;
3326
3327 if (locale_error_handler(errors, &surrogateescape) < 0)
3328 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003329
3330 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3331 if (wstr == NULL)
3332 return NULL;
3333
3334 wlen2 = wcslen(wstr);
3335 if (wlen2 != wlen) {
3336 PyMem_Free(wstr);
3337 PyErr_SetString(PyExc_TypeError, "embedded null character");
3338 return NULL;
3339 }
3340
3341 if (surrogateescape) {
3342 /* locale encoding with surrogateescape */
3343 char *str;
3344
3345 str = _Py_wchar2char(wstr, &error_pos);
3346 if (str == NULL) {
3347 if (error_pos == (size_t)-1) {
3348 PyErr_NoMemory();
3349 PyMem_Free(wstr);
3350 return NULL;
3351 }
3352 else {
3353 goto encode_error;
3354 }
3355 }
3356 PyMem_Free(wstr);
3357
3358 bytes = PyBytes_FromString(str);
3359 PyMem_Free(str);
3360 }
3361 else {
3362 size_t len, len2;
3363
3364 len = wcstombs(NULL, wstr, 0);
3365 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003366 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003367 goto encode_error;
3368 }
3369
3370 bytes = PyBytes_FromStringAndSize(NULL, len);
3371 if (bytes == NULL) {
3372 PyMem_Free(wstr);
3373 return NULL;
3374 }
3375
3376 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3377 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003378 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003379 goto encode_error;
3380 }
3381 PyMem_Free(wstr);
3382 }
3383 return bytes;
3384
3385encode_error:
3386 errmsg = strerror(errno);
3387 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003388
3389 if (error_pos == (size_t)-1)
3390 error_pos = wcstombs_errorpos(wstr);
3391
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003392 PyMem_Free(wstr);
3393 Py_XDECREF(bytes);
3394
Victor Stinner2f197072011-12-17 07:08:30 +01003395 if (errmsg != NULL) {
3396 size_t errlen;
3397 wstr = _Py_char2wchar(errmsg, &errlen);
3398 if (wstr != NULL) {
3399 reason = PyUnicode_FromWideChar(wstr, errlen);
3400 PyMem_Free(wstr);
3401 } else
3402 errmsg = NULL;
3403 }
3404 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003405 reason = PyUnicode_FromString(
3406 "wcstombs() encountered an unencodable "
3407 "wide character");
3408 if (reason == NULL)
3409 return NULL;
3410
3411 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3412 "locale", unicode,
3413 (Py_ssize_t)error_pos,
3414 (Py_ssize_t)(error_pos+1),
3415 reason);
3416 Py_DECREF(reason);
3417 if (exc != NULL) {
3418 PyCodec_StrictErrors(exc);
3419 Py_XDECREF(exc);
3420 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003421 return NULL;
3422}
3423
Victor Stinnerad158722010-10-27 00:25:46 +00003424PyObject *
3425PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003426{
Victor Stinner99b95382011-07-04 14:23:54 +02003427#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003428 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003429#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003430 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003431#else
Victor Stinner793b5312011-04-27 00:24:21 +02003432 PyInterpreterState *interp = PyThreadState_GET()->interp;
3433 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3434 cannot use it to encode and decode filenames before it is loaded. Load
3435 the Python codec requires to encode at least its own filename. Use the C
3436 version of the locale codec until the codec registry is initialized and
3437 the Python codec is loaded.
3438
3439 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3440 cannot only rely on it: check also interp->fscodec_initialized for
3441 subinterpreters. */
3442 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003443 return PyUnicode_AsEncodedString(unicode,
3444 Py_FileSystemDefaultEncoding,
3445 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003446 }
3447 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003448 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003449 }
Victor Stinnerad158722010-10-27 00:25:46 +00003450#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003451}
3452
Alexander Belopolsky40018472011-02-26 01:02:56 +00003453PyObject *
3454PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003455 const char *encoding,
3456 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003457{
3458 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003459 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003460
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003463 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464 }
Fred Drakee4315f52000-05-09 19:53:39 +00003465
Fred Drakee4315f52000-05-09 19:53:39 +00003466 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003467 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003468 if ((strcmp(lower, "utf-8") == 0) ||
3469 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003470 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003471 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003472 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003473 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003474 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003475 }
Victor Stinner37296e82010-06-10 13:36:23 +00003476 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003477 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003478 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003479 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003480#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003481 else if (strcmp(lower, "mbcs") == 0)
3482 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003483#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003484 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003485 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003486 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487
3488 /* Encode via the codec registry */
3489 v = PyCodec_Encode(unicode, encoding, errors);
3490 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003491 return NULL;
3492
3493 /* The normal path */
3494 if (PyBytes_Check(v))
3495 return v;
3496
3497 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003498 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003499 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003500 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003501
3502 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3503 "encoder %s returned bytearray instead of bytes",
3504 encoding);
3505 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003506 Py_DECREF(v);
3507 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003508 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003509
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003510 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3511 Py_DECREF(v);
3512 return b;
3513 }
3514
3515 PyErr_Format(PyExc_TypeError,
3516 "encoder did not return a bytes object (type=%.400s)",
3517 Py_TYPE(v)->tp_name);
3518 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003519 return NULL;
3520}
3521
Alexander Belopolsky40018472011-02-26 01:02:56 +00003522PyObject *
3523PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003524 const char *encoding,
3525 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003526{
3527 PyObject *v;
3528
3529 if (!PyUnicode_Check(unicode)) {
3530 PyErr_BadArgument();
3531 goto onError;
3532 }
3533
3534 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003535 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003536
3537 /* Encode via the codec registry */
3538 v = PyCodec_Encode(unicode, encoding, errors);
3539 if (v == NULL)
3540 goto onError;
3541 if (!PyUnicode_Check(v)) {
3542 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003543 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003544 Py_TYPE(v)->tp_name);
3545 Py_DECREF(v);
3546 goto onError;
3547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003549
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return NULL;
3552}
3553
/* Find the first byte sequence in str (len bytes) that mbrtowc() cannot
   decode, scanning from the start with a zero-initialized conversion state.

   Returns the byte offset of the first invalid or incomplete multibyte
   sequence.  Returns 0 when no such sequence is found, when a null
   character is reached first, or when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    mbstate_t state;
    wchar_t ch;

    memset(&state, 0, sizeof state);
    while (len != 0) {
        /* mbrtowc() takes a const char *, so no cast is needed here. */
        size_t n = mbrtowc(&ch, p, len, &state);
        if (n == 0)
            /* Reached end of string */
            break;
        if (n == (size_t)-1 || n == (size_t)-2)
            /* Conversion error or incomplete character */
            return (size_t)(p - str);
        p += n;
        len -= n;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3584
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003585PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003586PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003587 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003588{
3589 wchar_t smallbuf[256];
3590 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3591 wchar_t *wstr;
3592 size_t wlen, wlen2;
3593 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003594 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003595 size_t error_pos;
3596 char *errmsg;
3597 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003598
3599 if (locale_error_handler(errors, &surrogateescape) < 0)
3600 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003601
3602 if (str[len] != '\0' || len != strlen(str)) {
3603 PyErr_SetString(PyExc_TypeError, "embedded null character");
3604 return NULL;
3605 }
3606
3607 if (surrogateescape)
3608 {
3609 wstr = _Py_char2wchar(str, &wlen);
3610 if (wstr == NULL) {
3611 if (wlen == (size_t)-1)
3612 PyErr_NoMemory();
3613 else
3614 PyErr_SetFromErrno(PyExc_OSError);
3615 return NULL;
3616 }
3617
3618 unicode = PyUnicode_FromWideChar(wstr, wlen);
3619 PyMem_Free(wstr);
3620 }
3621 else {
3622#ifndef HAVE_BROKEN_MBSTOWCS
3623 wlen = mbstowcs(NULL, str, 0);
3624#else
3625 wlen = len;
3626#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003627 if (wlen == (size_t)-1)
3628 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003629 if (wlen+1 <= smallbuf_len) {
3630 wstr = smallbuf;
3631 }
3632 else {
3633 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3634 return PyErr_NoMemory();
3635
3636 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3637 if (!wstr)
3638 return PyErr_NoMemory();
3639 }
3640
3641 /* This shouldn't fail now */
3642 wlen2 = mbstowcs(wstr, str, wlen+1);
3643 if (wlen2 == (size_t)-1) {
3644 if (wstr != smallbuf)
3645 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003646 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003647 }
3648#ifdef HAVE_BROKEN_MBSTOWCS
3649 assert(wlen2 == wlen);
3650#endif
3651 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3652 if (wstr != smallbuf)
3653 PyMem_Free(wstr);
3654 }
3655 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003656
3657decode_error:
3658 errmsg = strerror(errno);
3659 assert(errmsg != NULL);
3660
3661 error_pos = mbstowcs_errorpos(str, len);
3662 if (errmsg != NULL) {
3663 size_t errlen;
3664 wstr = _Py_char2wchar(errmsg, &errlen);
3665 if (wstr != NULL) {
3666 reason = PyUnicode_FromWideChar(wstr, errlen);
3667 PyMem_Free(wstr);
3668 } else
3669 errmsg = NULL;
3670 }
3671 if (errmsg == NULL)
3672 reason = PyUnicode_FromString(
3673 "mbstowcs() encountered an invalid multibyte sequence");
3674 if (reason == NULL)
3675 return NULL;
3676
3677 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3678 "locale", str, len,
3679 (Py_ssize_t)error_pos,
3680 (Py_ssize_t)(error_pos+1),
3681 reason);
3682 Py_DECREF(reason);
3683 if (exc != NULL) {
3684 PyCodec_StrictErrors(exc);
3685 Py_XDECREF(exc);
3686 }
3687 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003688}
3689
3690PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003691PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003692{
3693 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003694 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003695}
3696
3697
3698PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003699PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003700 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003701 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3702}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003703
Christian Heimes5894ba72007-11-04 11:43:14 +00003704PyObject*
3705PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3706{
Victor Stinner99b95382011-07-04 14:23:54 +02003707#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003708 return PyUnicode_DecodeMBCS(s, size, NULL);
3709#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003710 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003711#else
Victor Stinner793b5312011-04-27 00:24:21 +02003712 PyInterpreterState *interp = PyThreadState_GET()->interp;
3713 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3714 cannot use it to encode and decode filenames before it is loaded. Load
3715 the Python codec requires to encode at least its own filename. Use the C
3716 version of the locale codec until the codec registry is initialized and
3717 the Python codec is loaded.
3718
3719 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3720 cannot only rely on it: check also interp->fscodec_initialized for
3721 subinterpreters. */
3722 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003723 return PyUnicode_Decode(s, size,
3724 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003725 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003726 }
3727 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003728 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003729 }
Victor Stinnerad158722010-10-27 00:25:46 +00003730#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003731}
3732
Martin v. Löwis011e8422009-05-05 04:43:17 +00003733
3734int
Antoine Pitrou13348842012-01-29 18:36:34 +01003735_PyUnicode_HasNULChars(PyObject* s)
3736{
3737 static PyObject *nul = NULL;
3738
3739 if (nul == NULL)
3740 nul = PyUnicode_FromStringAndSize("\0", 1);
3741 if (nul == NULL)
3742 return -1;
3743 return PyUnicode_Contains(s, nul);
3744}
3745
3746
3747int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003748PyUnicode_FSConverter(PyObject* arg, void* addr)
3749{
3750 PyObject *output = NULL;
3751 Py_ssize_t size;
3752 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003753 if (arg == NULL) {
3754 Py_DECREF(*(PyObject**)addr);
3755 return 1;
3756 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003757 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003758 output = arg;
3759 Py_INCREF(output);
3760 }
3761 else {
3762 arg = PyUnicode_FromObject(arg);
3763 if (!arg)
3764 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003765 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003766 Py_DECREF(arg);
3767 if (!output)
3768 return 0;
3769 if (!PyBytes_Check(output)) {
3770 Py_DECREF(output);
3771 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3772 return 0;
3773 }
3774 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003775 size = PyBytes_GET_SIZE(output);
3776 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003777 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003778 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003779 Py_DECREF(output);
3780 return 0;
3781 }
3782 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003783 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003784}
3785
3786
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003787int
3788PyUnicode_FSDecoder(PyObject* arg, void* addr)
3789{
3790 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003791 if (arg == NULL) {
3792 Py_DECREF(*(PyObject**)addr);
3793 return 1;
3794 }
3795 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003796 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003797 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003798 output = arg;
3799 Py_INCREF(output);
3800 }
3801 else {
3802 arg = PyBytes_FromObject(arg);
3803 if (!arg)
3804 return 0;
3805 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3806 PyBytes_GET_SIZE(arg));
3807 Py_DECREF(arg);
3808 if (!output)
3809 return 0;
3810 if (!PyUnicode_Check(output)) {
3811 Py_DECREF(output);
3812 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3813 return 0;
3814 }
3815 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003816 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003817 Py_DECREF(output);
3818 return 0;
3819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003821 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003822 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3823 Py_DECREF(output);
3824 return 0;
3825 }
3826 *(PyObject**)addr = output;
3827 return Py_CLEANUP_SUPPORTED;
3828}
3829
3830
Martin v. Löwis5b222132007-06-10 09:51:05 +00003831char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003833{
Christian Heimesf3863112007-11-22 07:46:41 +00003834 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003835
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003836 if (!PyUnicode_Check(unicode)) {
3837 PyErr_BadArgument();
3838 return NULL;
3839 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003840 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003841 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003842
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003843 if (PyUnicode_UTF8(unicode) == NULL) {
3844 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3846 if (bytes == NULL)
3847 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003848 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3849 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003850 Py_DECREF(bytes);
3851 return NULL;
3852 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3854 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3855 PyBytes_AS_STRING(bytes),
3856 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857 Py_DECREF(bytes);
3858 }
3859
3860 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003861 *psize = PyUnicode_UTF8_LENGTH(unicode);
3862 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003863}
3864
3865char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003868 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3869}
3870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871Py_UNICODE *
3872PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3873{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003874 const unsigned char *one_byte;
3875#if SIZEOF_WCHAR_T == 4
3876 const Py_UCS2 *two_bytes;
3877#else
3878 const Py_UCS4 *four_bytes;
3879 const Py_UCS4 *ucs4_end;
3880 Py_ssize_t num_surrogates;
3881#endif
3882 wchar_t *w;
3883 wchar_t *wchar_end;
3884
3885 if (!PyUnicode_Check(unicode)) {
3886 PyErr_BadArgument();
3887 return NULL;
3888 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003889 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003891 assert(_PyUnicode_KIND(unicode) != 0);
3892 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003893
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003894 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003896 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3897 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003898 num_surrogates = 0;
3899
3900 for (; four_bytes < ucs4_end; ++four_bytes) {
3901 if (*four_bytes > 0xFFFF)
3902 ++num_surrogates;
3903 }
3904
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003905 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3906 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3907 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003908 PyErr_NoMemory();
3909 return NULL;
3910 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003911 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003913 w = _PyUnicode_WSTR(unicode);
3914 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3915 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3917 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003918 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003920 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3921 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003922 }
3923 else
3924 *w = *four_bytes;
3925
3926 if (w > wchar_end) {
3927 assert(0 && "Miscalculated string end");
3928 }
3929 }
3930 *w = 0;
3931#else
3932 /* sizeof(wchar_t) == 4 */
3933 Py_FatalError("Impossible unicode object state, wstr and str "
3934 "should share memory already.");
3935 return NULL;
3936#endif
3937 }
3938 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3940 (_PyUnicode_LENGTH(unicode) + 1));
3941 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003942 PyErr_NoMemory();
3943 return NULL;
3944 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3946 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3947 w = _PyUnicode_WSTR(unicode);
3948 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003950 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3951 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003952 for (; w < wchar_end; ++one_byte, ++w)
3953 *w = *one_byte;
3954 /* null-terminate the wstr */
3955 *w = 0;
3956 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003957 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003958#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003960 for (; w < wchar_end; ++two_bytes, ++w)
3961 *w = *two_bytes;
3962 /* null-terminate the wstr */
3963 *w = 0;
3964#else
3965 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003966 PyObject_FREE(_PyUnicode_WSTR(unicode));
3967 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 Py_FatalError("Impossible unicode object state, wstr "
3969 "and str should share memory already.");
3970 return NULL;
3971#endif
3972 }
3973 else {
3974 assert(0 && "This should never happen.");
3975 }
3976 }
3977 }
3978 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003979 *size = PyUnicode_WSTR_LENGTH(unicode);
3980 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003981}
3982
Alexander Belopolsky40018472011-02-26 01:02:56 +00003983Py_UNICODE *
3984PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987}
3988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003989
Alexander Belopolsky40018472011-02-26 01:02:56 +00003990Py_ssize_t
3991PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992{
3993 if (!PyUnicode_Check(unicode)) {
3994 PyErr_BadArgument();
3995 goto onError;
3996 }
3997 return PyUnicode_GET_SIZE(unicode);
3998
Benjamin Peterson29060642009-01-31 22:14:21 +00003999 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 return -1;
4001}
4002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003Py_ssize_t
4004PyUnicode_GetLength(PyObject *unicode)
4005{
Victor Stinner07621332012-06-16 04:53:46 +02004006 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007 PyErr_BadArgument();
4008 return -1;
4009 }
Victor Stinner07621332012-06-16 04:53:46 +02004010 if (PyUnicode_READY(unicode) == -1)
4011 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_GET_LENGTH(unicode);
4013}
4014
4015Py_UCS4
4016PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4017{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004018 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4019 PyErr_BadArgument();
4020 return (Py_UCS4)-1;
4021 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004022 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004023 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 return (Py_UCS4)-1;
4025 }
4026 return PyUnicode_READ_CHAR(unicode, index);
4027}
4028
4029int
4030PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4031{
4032 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004033 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 return -1;
4035 }
Victor Stinner488fa492011-12-12 00:01:39 +01004036 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004037 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004038 PyErr_SetString(PyExc_IndexError, "string index out of range");
4039 return -1;
4040 }
Victor Stinner488fa492011-12-12 00:01:39 +01004041 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004042 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004043 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4044 PyErr_SetString(PyExc_ValueError, "character out of range");
4045 return -1;
4046 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4048 index, ch);
4049 return 0;
4050}
4051
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4057
Victor Stinner554f3f02010-06-16 23:33:54 +00004058/* create or adjust a UnicodeDecodeError */
4059static void
4060make_decode_exception(PyObject **exceptionObject,
4061 const char *encoding,
4062 const char *input, Py_ssize_t length,
4063 Py_ssize_t startpos, Py_ssize_t endpos,
4064 const char *reason)
4065{
4066 if (*exceptionObject == NULL) {
4067 *exceptionObject = PyUnicodeDecodeError_Create(
4068 encoding, input, length, startpos, endpos, reason);
4069 }
4070 else {
4071 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4072 goto onError;
4073 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4074 goto onError;
4075 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4076 goto onError;
4077 }
4078 return;
4079
4080onError:
4081 Py_DECREF(*exceptionObject);
4082 *exceptionObject = NULL;
4083}
4084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004085/* error handling callback helper:
4086 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004087 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004088 and adjust various state variables.
4089 return 0 on success, -1 on error
4090*/
4091
Alexander Belopolsky40018472011-02-26 01:02:56 +00004092static int
4093unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004094 const char *encoding, const char *reason,
4095 const char **input, const char **inend, Py_ssize_t *startinpos,
4096 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004097 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004098{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004099 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004100
4101 PyObject *restuple = NULL;
4102 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004103 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004104 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004105 Py_ssize_t requiredsize;
4106 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004107 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004108 int res = -1;
4109
Victor Stinner596a6c42011-11-09 00:02:18 +01004110 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4111 outsize = PyUnicode_GET_LENGTH(*output);
4112 else
4113 outsize = _PyUnicode_WSTR_LENGTH(*output);
4114
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004115 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 *errorHandler = PyCodec_LookupError(errors);
4117 if (*errorHandler == NULL)
4118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 }
4120
Victor Stinner554f3f02010-06-16 23:33:54 +00004121 make_decode_exception(exceptionObject,
4122 encoding,
4123 *input, *inend - *input,
4124 *startinpos, *endinpos,
4125 reason);
4126 if (*exceptionObject == NULL)
4127 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004128
4129 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4130 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004131 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004133 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135 }
4136 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004138 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004139 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140
4141 /* Copy back the bytes variables, which might have been modified by the
4142 callback */
4143 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4144 if (!inputobj)
4145 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004146 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004147 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004148 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004149 *input = PyBytes_AS_STRING(inputobj);
4150 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004151 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004152 /* we can DECREF safely, as the exception has another reference,
4153 so the object won't go away. */
4154 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004157 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004158 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004159 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4160 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004161 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162
Victor Stinner596a6c42011-11-09 00:02:18 +01004163 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4164 /* need more space? (at least enough for what we
4165 have+the replacement+the rest of the string (starting
4166 at the new input position), so we won't have to check space
4167 when there are no errors in the rest of the string) */
4168 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4169 requiredsize = *outpos + replen + insize-newpos;
4170 if (requiredsize > outsize) {
4171 if (requiredsize<2*outsize)
4172 requiredsize = 2*outsize;
4173 if (unicode_resize(output, requiredsize) < 0)
4174 goto onError;
4175 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004176 if (unicode_widen(output, *outpos,
4177 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004178 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004179 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004180 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004181 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004182 else {
4183 wchar_t *repwstr;
4184 Py_ssize_t repwlen;
4185 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4186 if (repwstr == NULL)
4187 goto onError;
4188 /* need more space? (at least enough for what we
4189 have+the replacement+the rest of the string (starting
4190 at the new input position), so we won't have to check space
4191 when there are no errors in the rest of the string) */
4192 requiredsize = *outpos + repwlen + insize-newpos;
4193 if (requiredsize > outsize) {
4194 if (requiredsize < 2*outsize)
4195 requiredsize = 2*outsize;
4196 if (unicode_resize(output, requiredsize) < 0)
4197 goto onError;
4198 }
4199 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4200 *outpos += repwlen;
4201 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004202 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004203 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004204
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004205 /* we made it! */
4206 res = 0;
4207
Benjamin Peterson29060642009-01-31 22:14:21 +00004208 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 Py_XDECREF(restuple);
4210 return res;
4211}
4212
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004213/* --- UTF-7 Codec -------------------------------------------------------- */
4214
Antoine Pitrou244651a2009-05-04 18:56:13 +00004215/* See RFC2152 for details. We encode conservatively and decode liberally. */
4216
4217/* Three simple macros defining base-64. */
4218
/* Is ch one of the 64 base-64 alphabet characters (A-Z, a-z, 0-9, '+',
   '/')?  The argument may be evaluated several times. */

#define IS_BASE64(ch)                       \
    (((ch) >= 'A' && (ch) <= 'Z') ||        \
     ((ch) >= 'a' && (ch) <= 'z') ||        \
     ((ch) >= '0' && (ch) <= '9') ||        \
     (ch) == '+' || (ch) == '/')

/* Given that ch is a base-64 character, what is its 6-bit value?
   Any argument that is not a letter, digit or '+' yields 63. */

#define FROM_BASE64(ch)                                 \
    (((ch) >= 'A' && (ch) <= 'Z') ? (ch) - 'A' :        \
     ((ch) >= 'a' && (ch) <= 'z') ? (ch) - 'a' + 26 :   \
     ((ch) >= '0' && (ch) <= '9') ? (ch) - '0' + 52 :   \
     (ch) == '+' ? 62 : 63)

/* What is the base-64 character for the bottom 6 bits of bits? */

#define TO_BASE64(bits)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(bits) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: the only ASCII byte that
 * does not decode to itself is the '+' which begins a base-64 run. */

#define DECODE_DIRECT(ch)                   \
    ((ch) <= 127 && (ch) != '+')
4247
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4281
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : the code point to test; NUL and anything >= 128 is never direct.
 * directO  : non-zero if "Set O" characters may be written as themselves.
 * directWS : non-zero if whitespace may be written as itself.
 *
 * "Set D" characters (category 0) are always direct.  c is evaluated
 * several times; all arguments are parenthesized so that arbitrary
 * expressions can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004293
Alexander Belopolsky40018472011-02-26 01:02:56 +00004294PyObject *
4295PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004296 Py_ssize_t size,
4297 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004299 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4300}
4301
Antoine Pitrou244651a2009-05-04 18:56:13 +00004302/* The decoder. The only state we preserve is our read position,
4303 * i.e. how many characters we have consumed. So if we end in the
4304 * middle of a shift sequence we have to back off the read position
4305 * and the output to the beginning of the sequence, otherwise we lose
4306 * all the shift state (seen bits, number of bits seen, high
4307 * surrogate). */
4308
Alexander Belopolsky40018472011-02-26 01:02:56 +00004309PyObject *
4310PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004311 Py_ssize_t size,
4312 const char *errors,
4313 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004314{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004315 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004316 Py_ssize_t startinpos;
4317 Py_ssize_t endinpos;
4318 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004319 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004320 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004321 const char *errmsg = "";
4322 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004323 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004324 unsigned int base64bits = 0;
4325 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004326 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004327 PyObject *errorHandler = NULL;
4328 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004329
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004330 /* Start off assuming it's all ASCII. Widen later as necessary. */
4331 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004332 if (!unicode)
4333 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004334 if (size == 0) {
4335 if (consumed)
4336 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004337 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004338 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004339
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004340 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004341 e = s + size;
4342
4343 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004344 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004345 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004346 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004347
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 if (inShift) { /* in a base-64 section */
4349 if (IS_BASE64(ch)) { /* consume a base-64 character */
4350 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4351 base64bits += 6;
4352 s++;
4353 if (base64bits >= 16) {
4354 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004355 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004356 base64bits -= 16;
4357 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4358 if (surrogate) {
4359 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004360 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4361 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004362 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4363 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004365 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004366 }
4367 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004368 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4369 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004371 }
4372 }
Victor Stinner551ac952011-11-29 22:58:13 +01004373 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 /* first surrogate */
4375 surrogate = outCh;
4376 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004378 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4379 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004380 }
4381 }
4382 }
4383 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 inShift = 0;
4385 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004387 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4388 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004389 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004390 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 if (base64bits > 0) { /* left-over bits */
4392 if (base64bits >= 6) {
4393 /* We've seen at least one base-64 character */
4394 errmsg = "partial character in shift sequence";
4395 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004397 else {
4398 /* Some bits remain; they should be zero */
4399 if (base64buffer != 0) {
4400 errmsg = "non-zero padding bits in shift sequence";
4401 goto utf7Error;
4402 }
4403 }
4404 }
4405 if (ch != '-') {
4406 /* '-' is absorbed; other terminating
4407 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004408 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4409 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004410 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004411 }
4412 }
4413 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004414 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004415 s++; /* consume '+' */
4416 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004417 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004418 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4419 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004420 }
4421 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004422 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004423 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004425 }
4426 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004427 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004428 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4429 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004430 s++;
4431 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004432 else {
4433 startinpos = s-starts;
4434 s++;
4435 errmsg = "unexpected special character";
4436 goto utf7Error;
4437 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004438 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004440 endinpos = s-starts;
4441 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 errors, &errorHandler,
4443 "utf7", errmsg,
4444 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004445 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004447 }
4448
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 /* end of string */
4450
4451 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4452 /* if we're in an inconsistent state, that's an error */
4453 if (surrogate ||
4454 (base64bits >= 6) ||
4455 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004456 endinpos = size;
4457 if (unicode_decode_call_errorhandler(
4458 errors, &errorHandler,
4459 "utf7", "unterminated shift sequence",
4460 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004461 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 goto onError;
4463 if (s < e)
4464 goto restart;
4465 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467
4468 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004469 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004470 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004471 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004472 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 }
4474 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004476 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004477 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004479 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 goto onError;
4481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004482 Py_XDECREF(errorHandler);
4483 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004484 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004485
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004487 Py_XDECREF(errorHandler);
4488 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489 Py_DECREF(unicode);
4490 return NULL;
4491}
4492
4493
Alexander Belopolsky40018472011-02-26 01:02:56 +00004494PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004495_PyUnicode_EncodeUTF7(PyObject *str,
4496 int base64SetO,
4497 int base64WhiteSpace,
4498 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004499{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004500 int kind;
4501 void *data;
4502 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004503 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004505 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004506 unsigned int base64bits = 0;
4507 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 char * out;
4509 char * start;
4510
Benjamin Petersonbac79492012-01-14 13:34:47 -05004511 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004512 return NULL;
4513 kind = PyUnicode_KIND(str);
4514 data = PyUnicode_DATA(str);
4515 len = PyUnicode_GET_LENGTH(str);
4516
4517 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004519
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004520 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004521 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004522 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004523 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524 if (v == NULL)
4525 return NULL;
4526
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004527 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004528 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004529 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 if (inShift) {
4532 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4533 /* shifting out */
4534 if (base64bits) { /* output remaining bits */
4535 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4536 base64buffer = 0;
4537 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004538 }
4539 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 /* Characters not in the BASE64 set implicitly unshift the sequence
4541 so no '-' is required, except if the character is itself a '-' */
4542 if (IS_BASE64(ch) || ch == '-') {
4543 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004544 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004545 *out++ = (char) ch;
4546 }
4547 else {
4548 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004549 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 else { /* not in a shift sequence */
4552 if (ch == '+') {
4553 *out++ = '+';
4554 *out++ = '-';
4555 }
4556 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4557 *out++ = (char) ch;
4558 }
4559 else {
4560 *out++ = '+';
4561 inShift = 1;
4562 goto encode_char;
4563 }
4564 }
4565 continue;
4566encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004568 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004569
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 /* code first surrogate */
4571 base64bits += 16;
4572 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4573 while (base64bits >= 6) {
4574 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4575 base64bits -= 6;
4576 }
4577 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004578 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 base64bits += 16;
4581 base64buffer = (base64buffer << 16) | ch;
4582 while (base64bits >= 6) {
4583 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4584 base64bits -= 6;
4585 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004586 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 if (base64bits)
4588 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4589 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004591 if (_PyBytes_Resize(&v, out - start) < 0)
4592 return NULL;
4593 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004595PyObject *
4596PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4597 Py_ssize_t size,
4598 int base64SetO,
4599 int base64WhiteSpace,
4600 const char *errors)
4601{
4602 PyObject *result;
4603 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4604 if (tmp == NULL)
4605 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004606 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004607 base64WhiteSpace, errors);
4608 Py_DECREF(tmp);
4609 return result;
4610}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004611
/* The UTF-7 helper macros are private to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004617
Guido van Rossumd57fd912000-03-10 22:53:23 +00004618/* --- UTF-8 Codec -------------------------------------------------------- */
4619
Alexander Belopolsky40018472011-02-26 01:02:56 +00004620PyObject *
4621PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004622 Py_ssize_t size,
4623 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004624{
Walter Dörwald69652032004-09-07 20:24:22 +00004625 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4626}
4627
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004628#include "stringlib/asciilib.h"
4629#include "stringlib/codecs.h"
4630#include "stringlib/undef.h"
4631
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004632#include "stringlib/ucs1lib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
4636#include "stringlib/ucs2lib.h"
4637#include "stringlib/codecs.h"
4638#include "stringlib/undef.h"
4639
4640#include "stringlib/ucs4lib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
Antoine Pitrouab868312009-01-10 15:40:25 +00004644/* Mask to quickly check whether a C 'long' contains a
4645 non-ASCII, UTF8-encoded char. */
4646#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004647# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004648#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004649# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004650#else
4651# error C 'long' size should be either 4 or 8!
4652#endif
4653
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654static Py_ssize_t
4655ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004656{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004657 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004658 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004659
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004661 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4662 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004663 /* Fast path, see in STRINGLIB(utf8_decode) for
4664 an explanation. */
4665 /* Help register allocation */
4666 register const char *_p = p;
4667 register Py_UCS1 * q = dest;
4668 while (_p < aligned_end) {
4669 unsigned long value = *(const unsigned long *) _p;
4670 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004671 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004672 *((unsigned long *)q) = value;
4673 _p += SIZEOF_LONG;
4674 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004675 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676 p = _p;
4677 while (p < end) {
4678 if ((unsigned char)*p & 0x80)
4679 break;
4680 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004682 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004684#endif
4685 while (p < end) {
4686 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4687 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004688 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004689 /* Help register allocation */
4690 register const char *_p = p;
4691 while (_p < aligned_end) {
4692 unsigned long value = *(unsigned long *) _p;
4693 if (value & ASCII_CHAR_MASK)
4694 break;
4695 _p += SIZEOF_LONG;
4696 }
4697 p = _p;
4698 if (_p == end)
4699 break;
4700 }
4701 if ((unsigned char)*p & 0x80)
4702 break;
4703 ++p;
4704 }
4705 memcpy(dest, start, p - start);
4706 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707}
Antoine Pitrouab868312009-01-10 15:40:25 +00004708
Victor Stinner785938e2011-12-11 20:09:03 +01004709PyObject *
4710PyUnicode_DecodeUTF8Stateful(const char *s,
4711 Py_ssize_t size,
4712 const char *errors,
4713 Py_ssize_t *consumed)
4714{
Victor Stinner785938e2011-12-11 20:09:03 +01004715 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004716 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 const char *end = s + size;
4718 Py_ssize_t outpos;
4719
4720 Py_ssize_t startinpos;
4721 Py_ssize_t endinpos;
4722 const char *errmsg = "";
4723 PyObject *errorHandler = NULL;
4724 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004725
4726 if (size == 0) {
4727 if (consumed)
4728 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004729 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004730 }
4731
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4733 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004734 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 *consumed = 1;
4736 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004737 }
4738
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004740 if (!unicode)
4741 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004742
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4744 s += outpos;
4745 while (s < end) {
4746 Py_UCS4 ch;
4747 int kind = PyUnicode_KIND(unicode);
4748 if (kind == PyUnicode_1BYTE_KIND) {
4749 if (PyUnicode_IS_ASCII(unicode))
4750 ch = asciilib_utf8_decode(&s, end,
4751 PyUnicode_1BYTE_DATA(unicode), &outpos);
4752 else
4753 ch = ucs1lib_utf8_decode(&s, end,
4754 PyUnicode_1BYTE_DATA(unicode), &outpos);
4755 } else if (kind == PyUnicode_2BYTE_KIND) {
4756 ch = ucs2lib_utf8_decode(&s, end,
4757 PyUnicode_2BYTE_DATA(unicode), &outpos);
4758 } else {
4759 assert(kind == PyUnicode_4BYTE_KIND);
4760 ch = ucs4lib_utf8_decode(&s, end,
4761 PyUnicode_4BYTE_DATA(unicode), &outpos);
4762 }
4763
4764 switch (ch) {
4765 case 0:
4766 if (s == end || consumed)
4767 goto End;
4768 errmsg = "unexpected end of data";
4769 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004770 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 break;
4772 case 1:
4773 errmsg = "invalid start byte";
4774 startinpos = s - starts;
4775 endinpos = startinpos + 1;
4776 break;
4777 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004778 case 3:
4779 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004780 errmsg = "invalid continuation byte";
4781 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004782 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783 break;
4784 default:
4785 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4786 goto onError;
4787 continue;
4788 }
4789
4790 if (unicode_decode_call_errorhandler(
4791 errors, &errorHandler,
4792 "utf-8", errmsg,
4793 &starts, &end, &startinpos, &endinpos, &exc, &s,
4794 &unicode, &outpos))
4795 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004796 }
4797
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004798End:
4799 if (unicode_resize(&unicode, outpos) < 0)
4800 goto onError;
4801
4802 if (consumed)
4803 *consumed = s - starts;
4804
4805 Py_XDECREF(errorHandler);
4806 Py_XDECREF(exc);
4807 assert(_PyUnicode_CheckConsistency(unicode, 1));
4808 return unicode;
4809
4810onError:
4811 Py_XDECREF(errorHandler);
4812 Py_XDECREF(exc);
4813 Py_XDECREF(unicode);
4814 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004815}
4816
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Undecodable bytes are stored as 0xDC00 + byte value.

   Return a pointer to a newly allocated wide character string (use
   PyMem_Free() to free the memory), or NULL on memory allocation error. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* every code point fits in a 4-byte wchar_t */
            assert(0);
#else
            /* ch is a non-BMP code point (not itself a surrogate) that a
               2-byte wchar_t cannot hold; it is split below. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004873/* Primary internal function which creates utf8 encoded bytes objects.
4874
4875 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004876 and allocate exactly as much space needed at the end. Else allocate the
4877 maximum possible needed (4 result bytes per Unicode character), and return
4878 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004879*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004880PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004881_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Victor Stinner6099a032011-12-18 14:22:26 +01004883 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004884 void *data;
4885 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004887 if (!PyUnicode_Check(unicode)) {
4888 PyErr_BadArgument();
4889 return NULL;
4890 }
4891
4892 if (PyUnicode_READY(unicode) == -1)
4893 return NULL;
4894
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004895 if (PyUnicode_UTF8(unicode))
4896 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4897 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004898
4899 kind = PyUnicode_KIND(unicode);
4900 data = PyUnicode_DATA(unicode);
4901 size = PyUnicode_GET_LENGTH(unicode);
4902
Benjamin Petersonead6b532011-12-20 17:23:42 -06004903 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004904 default:
4905 assert(0);
4906 case PyUnicode_1BYTE_KIND:
4907 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4908 assert(!PyUnicode_IS_ASCII(unicode));
4909 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4910 case PyUnicode_2BYTE_KIND:
4911 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4912 case PyUnicode_4BYTE_KIND:
4913 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004914 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004915}
4916
Alexander Belopolsky40018472011-02-26 01:02:56 +00004917PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004918PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4919 Py_ssize_t size,
4920 const char *errors)
4921{
4922 PyObject *v, *unicode;
4923
4924 unicode = PyUnicode_FromUnicode(s, size);
4925 if (unicode == NULL)
4926 return NULL;
4927 v = _PyUnicode_AsUTF8String(unicode, errors);
4928 Py_DECREF(unicode);
4929 return v;
4930}
4931
4932PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004933PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004935 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936}
4937
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938/* --- UTF-32 Codec ------------------------------------------------------- */
4939
4940PyObject *
4941PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004942 Py_ssize_t size,
4943 const char *errors,
4944 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945{
4946 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4947}
4948
4949PyObject *
4950PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004951 Py_ssize_t size,
4952 const char *errors,
4953 int *byteorder,
4954 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955{
4956 const char *starts = s;
4957 Py_ssize_t startinpos;
4958 Py_ssize_t endinpos;
4959 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004960 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004961 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004962 int bo = 0; /* assume native ordering by default */
4963 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004964 /* Offsets from q for retrieving bytes in the right order. */
4965#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4966 int iorder[] = {0, 1, 2, 3};
4967#else
4968 int iorder[] = {3, 2, 1, 0};
4969#endif
4970 PyObject *errorHandler = NULL;
4971 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004972
Walter Dörwald41980ca2007-08-16 21:55:45 +00004973 q = (unsigned char *)s;
4974 e = q + size;
4975
4976 if (byteorder)
4977 bo = *byteorder;
4978
4979 /* Check for BOM marks (U+FEFF) in the input and adjust current
4980 byte order setting accordingly. In native mode, the leading BOM
4981 mark is skipped, in all other modes, it is copied to the output
4982 stream as-is (giving a ZWNBSP character). */
4983 if (bo == 0) {
4984 if (size >= 4) {
4985 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004987#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004988 if (bom == 0x0000FEFF) {
4989 q += 4;
4990 bo = -1;
4991 }
4992 else if (bom == 0xFFFE0000) {
4993 q += 4;
4994 bo = 1;
4995 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004996#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004997 if (bom == 0x0000FEFF) {
4998 q += 4;
4999 bo = 1;
5000 }
5001 else if (bom == 0xFFFE0000) {
5002 q += 4;
5003 bo = -1;
5004 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005007 }
5008
5009 if (bo == -1) {
5010 /* force LE */
5011 iorder[0] = 0;
5012 iorder[1] = 1;
5013 iorder[2] = 2;
5014 iorder[3] = 3;
5015 }
5016 else if (bo == 1) {
5017 /* force BE */
5018 iorder[0] = 3;
5019 iorder[1] = 2;
5020 iorder[2] = 1;
5021 iorder[3] = 0;
5022 }
5023
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005024 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005025 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005026 if (!unicode)
5027 return NULL;
5028 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005029 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005030 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005031
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 Py_UCS4 ch;
5034 /* remaining bytes at the end? (size should be divisible by 4) */
5035 if (e-q<4) {
5036 if (consumed)
5037 break;
5038 errmsg = "truncated data";
5039 startinpos = ((const char *)q)-starts;
5040 endinpos = ((const char *)e)-starts;
5041 goto utf32Error;
5042 /* The remaining input chars are ignored if the callback
5043 chooses to skip the input */
5044 }
5045 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5046 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 if (ch >= 0x110000)
5049 {
5050 errmsg = "codepoint not in range(0x110000)";
5051 startinpos = ((const char *)q)-starts;
5052 endinpos = startinpos+4;
5053 goto utf32Error;
5054 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005055 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5056 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005057 q += 4;
5058 continue;
5059 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005060 if (unicode_decode_call_errorhandler(
5061 errors, &errorHandler,
5062 "utf32", errmsg,
5063 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005064 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066 }
5067
5068 if (byteorder)
5069 *byteorder = bo;
5070
5071 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005072 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073
5074 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005075 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076 goto onError;
5077
5078 Py_XDECREF(errorHandler);
5079 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005080 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 Py_DECREF(unicode);
5084 Py_XDECREF(errorHandler);
5085 Py_XDECREF(exc);
5086 return NULL;
5087}
5088
5089PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005090_PyUnicode_EncodeUTF32(PyObject *str,
5091 const char *errors,
5092 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005093{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005094 int kind;
5095 void *data;
5096 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005097 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005099 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100 /* Offsets from p for storing byte pairs in the right order. */
5101#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5102 int iorder[] = {0, 1, 2, 3};
5103#else
5104 int iorder[] = {3, 2, 1, 0};
5105#endif
5106
Benjamin Peterson29060642009-01-31 22:14:21 +00005107#define STORECHAR(CH) \
5108 do { \
5109 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5110 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5111 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5112 p[iorder[0]] = (CH) & 0xff; \
5113 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 } while(0)
5115
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005116 if (!PyUnicode_Check(str)) {
5117 PyErr_BadArgument();
5118 return NULL;
5119 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005120 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005121 return NULL;
5122 kind = PyUnicode_KIND(str);
5123 data = PyUnicode_DATA(str);
5124 len = PyUnicode_GET_LENGTH(str);
5125
5126 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005127 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005128 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005129 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130 if (v == NULL)
5131 return NULL;
5132
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005133 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005135 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005136 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005137 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138
5139 if (byteorder == -1) {
5140 /* force LE */
5141 iorder[0] = 0;
5142 iorder[1] = 1;
5143 iorder[2] = 2;
5144 iorder[3] = 3;
5145 }
5146 else if (byteorder == 1) {
5147 /* force BE */
5148 iorder[0] = 3;
5149 iorder[1] = 2;
5150 iorder[2] = 1;
5151 iorder[3] = 0;
5152 }
5153
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005154 for (i = 0; i < len; i++)
5155 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156
5157 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005158 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159#undef STORECHAR
5160}
5161
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005163PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5164 Py_ssize_t size,
5165 const char *errors,
5166 int byteorder)
5167{
5168 PyObject *result;
5169 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5170 if (tmp == NULL)
5171 return NULL;
5172 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5173 Py_DECREF(tmp);
5174 return result;
5175}
5176
5177PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005178PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005179{
Victor Stinnerb960b342011-11-20 19:12:52 +01005180 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005181}
5182
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183/* --- UTF-16 Codec ------------------------------------------------------- */
5184
Tim Peters772747b2001-08-09 22:21:55 +00005185PyObject *
5186PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 Py_ssize_t size,
5188 const char *errors,
5189 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190{
Walter Dörwald69652032004-09-07 20:24:22 +00005191 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5192}
5193
5194PyObject *
5195PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_ssize_t size,
5197 const char *errors,
5198 int *byteorder,
5199 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t startinpos;
5203 Py_ssize_t endinpos;
5204 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005205 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005206 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005207 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005208 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005209 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005210 PyObject *errorHandler = NULL;
5211 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
Tim Peters772747b2001-08-09 22:21:55 +00005213 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005214 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005217 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005219 /* Check for BOM marks (U+FEFF) in the input and adjust current
5220 byte order setting accordingly. In native mode, the leading BOM
5221 mark is skipped, in all other modes, it is copied to the output
5222 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005223 if (bo == 0 && size >= 2) {
5224 const Py_UCS4 bom = (q[1] << 8) | q[0];
5225 if (bom == 0xFEFF) {
5226 q += 2;
5227 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005229 else if (bom == 0xFFFE) {
5230 q += 2;
5231 bo = 1;
5232 }
5233 if (byteorder)
5234 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236
Antoine Pitrou63065d72012-05-15 23:48:04 +02005237 if (q == e) {
5238 if (consumed)
5239 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005240 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005241 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005242
Antoine Pitrouab868312009-01-10 15:40:25 +00005243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005245#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005247#endif
Tim Peters772747b2001-08-09 22:21:55 +00005248
Antoine Pitrou63065d72012-05-15 23:48:04 +02005249 /* Note: size will always be longer than the resulting Unicode
5250 character count */
5251 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5252 if (!unicode)
5253 return NULL;
5254
5255 outpos = 0;
5256 while (1) {
5257 Py_UCS4 ch = 0;
5258 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005259 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 if (kind == PyUnicode_1BYTE_KIND) {
5261 if (PyUnicode_IS_ASCII(unicode))
5262 ch = asciilib_utf16_decode(&q, e,
5263 PyUnicode_1BYTE_DATA(unicode), &outpos,
5264 native_ordering);
5265 else
5266 ch = ucs1lib_utf16_decode(&q, e,
5267 PyUnicode_1BYTE_DATA(unicode), &outpos,
5268 native_ordering);
5269 } else if (kind == PyUnicode_2BYTE_KIND) {
5270 ch = ucs2lib_utf16_decode(&q, e,
5271 PyUnicode_2BYTE_DATA(unicode), &outpos,
5272 native_ordering);
5273 } else {
5274 assert(kind == PyUnicode_4BYTE_KIND);
5275 ch = ucs4lib_utf16_decode(&q, e,
5276 PyUnicode_4BYTE_DATA(unicode), &outpos,
5277 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005278 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005279 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005280
Antoine Pitrou63065d72012-05-15 23:48:04 +02005281 switch (ch)
5282 {
5283 case 0:
5284 /* remaining byte at the end? (size should be even) */
5285 if (q == e || consumed)
5286 goto End;
5287 errmsg = "truncated data";
5288 startinpos = ((const char *)q) - starts;
5289 endinpos = ((const char *)e) - starts;
5290 break;
5291 /* The remaining input chars are ignored if the callback
5292 chooses to skip the input */
5293 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005294 q -= 2;
5295 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005296 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005298 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 endinpos = ((const char *)e) - starts;
5300 break;
5301 case 2:
5302 errmsg = "illegal encoding";
5303 startinpos = ((const char *)q) - 2 - starts;
5304 endinpos = startinpos + 2;
5305 break;
5306 case 3:
5307 errmsg = "illegal UTF-16 surrogate";
5308 startinpos = ((const char *)q) - 4 - starts;
5309 endinpos = startinpos + 2;
5310 break;
5311 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005312 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5313 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005314 continue;
5315 }
5316
Benjamin Peterson29060642009-01-31 22:14:21 +00005317 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 errors,
5319 &errorHandler,
5320 "utf16", errmsg,
5321 &starts,
5322 (const char **)&e,
5323 &startinpos,
5324 &endinpos,
5325 &exc,
5326 (const char **)&q,
5327 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005328 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 }
5331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332End:
Walter Dörwald69652032004-09-07 20:24:22 +00005333 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005334 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005335
Guido van Rossumd57fd912000-03-10 22:53:23 +00005336 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005337 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005338 goto onError;
5339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005340 Py_XDECREF(errorHandler);
5341 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005342 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005345 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005346 Py_XDECREF(errorHandler);
5347 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 return NULL;
5349}
5350
Tim Peters772747b2001-08-09 22:21:55 +00005351PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005352_PyUnicode_EncodeUTF16(PyObject *str,
5353 const char *errors,
5354 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005356 enum PyUnicode_Kind kind;
5357 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005358 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005359 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005360 unsigned short *out;
5361 Py_ssize_t bytesize;
5362 Py_ssize_t pairs;
5363#ifdef WORDS_BIGENDIAN
5364 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005365#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005366 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005367#endif
5368
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005369 if (!PyUnicode_Check(str)) {
5370 PyErr_BadArgument();
5371 return NULL;
5372 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005373 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005374 return NULL;
5375 kind = PyUnicode_KIND(str);
5376 data = PyUnicode_DATA(str);
5377 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005378
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005379 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005380 if (kind == PyUnicode_4BYTE_KIND) {
5381 const Py_UCS4 *in = (const Py_UCS4 *)data;
5382 const Py_UCS4 *end = in + len;
5383 while (in < end)
5384 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005385 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005386 }
5387 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005389 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005390 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 if (v == NULL)
5392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005394 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005395 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005396 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005399 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005400 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005401
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005402 switch (kind) {
5403 case PyUnicode_1BYTE_KIND: {
5404 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5405 break;
Tim Peters772747b2001-08-09 22:21:55 +00005406 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005407 case PyUnicode_2BYTE_KIND: {
5408 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5409 break;
Tim Peters772747b2001-08-09 22:21:55 +00005410 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 case PyUnicode_4BYTE_KIND: {
5412 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5413 break;
5414 }
5415 default:
5416 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005417 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005418
5419 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005420 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421}
5422
Alexander Belopolsky40018472011-02-26 01:02:56 +00005423PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005424PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5425 Py_ssize_t size,
5426 const char *errors,
5427 int byteorder)
5428{
5429 PyObject *result;
5430 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5431 if (tmp == NULL)
5432 return NULL;
5433 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5434 Py_DECREF(tmp);
5435 return result;
5436}
5437
5438PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005439PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005441 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442}
5443
5444/* --- Unicode Escape Codec ----------------------------------------------- */
5445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005446/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5447 if all the escapes in the string make it still a valid ASCII string.
5448 Returns -1 if any escapes were found which cause the string to
5449 pop out of ASCII range. Otherwise returns the length of the
5450 required buffer to hold the string.
5451 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005452static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005453length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5454{
5455 const unsigned char *p = (const unsigned char *)s;
5456 const unsigned char *end = p + size;
5457 Py_ssize_t length = 0;
5458
5459 if (size < 0)
5460 return -1;
5461
5462 for (; p < end; ++p) {
5463 if (*p > 127) {
5464 /* Non-ASCII */
5465 return -1;
5466 }
5467 else if (*p != '\\') {
5468 /* Normal character */
5469 ++length;
5470 }
5471 else {
5472 /* Backslash-escape, check next char */
5473 ++p;
5474 /* Escape sequence reaches till end of string or
5475 non-ASCII follow-up. */
5476 if (p >= end || *p > 127)
5477 return -1;
5478 switch (*p) {
5479 case '\n':
5480 /* backslash + \n result in zero characters */
5481 break;
5482 case '\\': case '\'': case '\"':
5483 case 'b': case 'f': case 't':
5484 case 'n': case 'r': case 'v': case 'a':
5485 ++length;
5486 break;
5487 case '0': case '1': case '2': case '3':
5488 case '4': case '5': case '6': case '7':
5489 case 'x': case 'u': case 'U': case 'N':
5490 /* these do not guarantee ASCII characters */
5491 return -1;
5492 default:
5493 /* count the backslash + the other character */
5494 length += 2;
5495 }
5496 }
5497 }
5498 return length;
5499}
5500
/* Cached pointer to the character-name lookup C API.  It starts out NULL
   and is filled in on demand by the \N{...} branch of
   PyUnicode_DecodeUnicodeEscape() through PyCapsule_Import(), which then
   calls its getcode() member to resolve names. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005501static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005502
Alexander Belopolsky40018472011-02-26 01:02:56 +00005503PyObject *
5504PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005505 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005506 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005508 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005509 Py_ssize_t startinpos;
5510 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005511 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005513 char* message;
5514 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005515 PyObject *errorHandler = NULL;
5516 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005517 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005518 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005519
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005520 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005521
5522 /* After length_of_escaped_ascii_string() there are two alternatives,
5523 either the string is pure ASCII with named escapes like \n, etc.
5524 and we determined it's exact size (common case)
5525 or it contains \x, \u, ... escape sequences. then we create a
5526 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005527 if (len >= 0) {
5528 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005529 if (!v)
5530 goto onError;
5531 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005532 }
5533 else {
5534 /* Escaped strings will always be longer than the resulting
5535 Unicode string, so we start with size here and then reduce the
5536 length after conversion to the true value.
5537 (but if the error callback returns a long replacement string
5538 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005539 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005540 if (!v)
5541 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005542 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005543 }
5544
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005546 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005549
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550 while (s < end) {
5551 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005552 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005553 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005555 /* The only case in which i == ascii_length is a backslash
5556 followed by a newline. */
5557 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005558
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 /* Non-escape characters are interpreted as Unicode ordinals */
5560 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005561 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5562 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 continue;
5564 }
5565
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005566 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 /* \ - Escapes */
5568 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005569 c = *s++;
5570 if (s > end)
5571 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005572
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005573 /* The only case in which i == ascii_length is a backslash
5574 followed by a newline. */
5575 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005577 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
Benjamin Peterson29060642009-01-31 22:14:21 +00005579 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005580#define WRITECHAR(ch) \
5581 do { \
5582 if (unicode_putchar(&v, &i, ch) < 0) \
5583 goto onError; \
5584 }while(0)
5585
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005587 case '\\': WRITECHAR('\\'); break;
5588 case '\'': WRITECHAR('\''); break;
5589 case '\"': WRITECHAR('\"'); break;
5590 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005592 case 'f': WRITECHAR('\014'); break;
5593 case 't': WRITECHAR('\t'); break;
5594 case 'n': WRITECHAR('\n'); break;
5595 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005597 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005598 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005599 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
Benjamin Peterson29060642009-01-31 22:14:21 +00005601 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 case '0': case '1': case '2': case '3':
5603 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005604 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005605 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005606 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005607 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005608 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005610 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 break;
5612
Benjamin Peterson29060642009-01-31 22:14:21 +00005613 /* hex escapes */
5614 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 digits = 2;
5617 message = "truncated \\xXX escape";
5618 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
Benjamin Peterson29060642009-01-31 22:14:21 +00005620 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005621 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005622 digits = 4;
5623 message = "truncated \\uXXXX escape";
5624 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625
Benjamin Peterson29060642009-01-31 22:14:21 +00005626 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005627 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005628 digits = 8;
5629 message = "truncated \\UXXXXXXXX escape";
5630 hexescape:
5631 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005632 if (end - s < digits) {
5633 /* count only hex digits */
5634 for (; s < end; ++s) {
5635 c = (unsigned char)*s;
5636 if (!Py_ISXDIGIT(c))
5637 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005638 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005639 goto error;
5640 }
5641 for (; digits--; ++s) {
5642 c = (unsigned char)*s;
5643 if (!Py_ISXDIGIT(c))
5644 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005645 chr = (chr<<4) & ~0xF;
5646 if (c >= '0' && c <= '9')
5647 chr += c - '0';
5648 else if (c >= 'a' && c <= 'f')
5649 chr += 10 + c - 'a';
5650 else
5651 chr += 10 + c - 'A';
5652 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005653 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005654 /* _decoding_error will have already written into the
5655 target buffer. */
5656 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005657 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005658 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005659 message = "illegal Unicode character";
5660 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005661 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005662 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005663 break;
5664
Benjamin Peterson29060642009-01-31 22:14:21 +00005665 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005666 case 'N':
5667 message = "malformed \\N character escape";
5668 if (ucnhash_CAPI == NULL) {
5669 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005670 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5671 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005672 if (ucnhash_CAPI == NULL)
5673 goto ucnhashError;
5674 }
5675 if (*s == '{') {
5676 const char *start = s+1;
5677 /* look for the closing brace */
5678 while (*s != '}' && s < end)
5679 s++;
5680 if (s > start && s < end && *s == '}') {
5681 /* found a name. look it up in the unicode database */
5682 message = "unknown Unicode character name";
5683 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005684 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005685 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005686 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687 goto store;
5688 }
5689 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005690 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691
5692 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005693 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 message = "\\ at end of string";
5695 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005696 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005697 }
5698 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005699 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005700 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005701 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005704 continue;
5705
5706 error:
5707 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005708 if (unicode_decode_call_errorhandler(
5709 errors, &errorHandler,
5710 "unicodeescape", message,
5711 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005712 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005713 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005714 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005715 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005716 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005717#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718
Victor Stinner16e6a802011-12-12 13:24:15 +01005719 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005720 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005721 Py_XDECREF(errorHandler);
5722 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005723 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005726 PyErr_SetString(
5727 PyExc_UnicodeError,
5728 "\\N escapes not supported (can't load unicodedata module)"
5729 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005730 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005731 Py_XDECREF(errorHandler);
5732 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005733 return NULL;
5734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005737 Py_XDECREF(errorHandler);
5738 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739 return NULL;
5740}
5741
/* Return a Unicode-Escape encoded version of the Unicode object as a
   bytes object.

   The result is not wrapped in quotes.

*/
5748
Alexander Belopolsky40018472011-02-26 01:02:56 +00005749PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005750PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005752 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005753 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005755 int kind;
5756 void *data;
5757 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Ezio Melottie7f90372012-10-05 03:33:31 +03005759 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005760 escape.
5761
Ezio Melottie7f90372012-10-05 03:33:31 +03005762 For UCS1 strings it's '\xxx', 4 bytes per source character.
5763 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5764 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005765 */
5766
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005767 if (!PyUnicode_Check(unicode)) {
5768 PyErr_BadArgument();
5769 return NULL;
5770 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005771 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005772 return NULL;
5773 len = PyUnicode_GET_LENGTH(unicode);
5774 kind = PyUnicode_KIND(unicode);
5775 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005776 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005777 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5778 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5779 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5780 }
5781
5782 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005783 return PyBytes_FromStringAndSize(NULL, 0);
5784
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005785 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005787
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005788 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005789 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005790 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792 if (repr == NULL)
5793 return NULL;
5794
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005795 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005797 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005798 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005799
Walter Dörwald79e913e2007-05-12 11:08:06 +00005800 /* Escape backslashes */
5801 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802 *p++ = '\\';
5803 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005804 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005805 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005806
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005807 /* Map 21-bit characters to '\U00xxxxxx' */
5808 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005809 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005810 *p++ = '\\';
5811 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005812 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5813 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5814 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5815 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5816 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5817 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5818 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5819 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005821 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005822
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005824 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005825 *p++ = '\\';
5826 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005827 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5828 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5829 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5830 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005832
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005833 /* Map special whitespace to '\t', \n', '\r' */
5834 else if (ch == '\t') {
5835 *p++ = '\\';
5836 *p++ = 't';
5837 }
5838 else if (ch == '\n') {
5839 *p++ = '\\';
5840 *p++ = 'n';
5841 }
5842 else if (ch == '\r') {
5843 *p++ = '\\';
5844 *p++ = 'r';
5845 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005846
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005847 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005848 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005850 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005851 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5852 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005853 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005854
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 /* Copy everything else as-is */
5856 else
5857 *p++ = (char) ch;
5858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005860 assert(p - PyBytes_AS_STRING(repr) > 0);
5861 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5862 return NULL;
5863 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864}
5865
Alexander Belopolsky40018472011-02-26 01:02:56 +00005866PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5868 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005870 PyObject *result;
5871 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5872 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 result = PyUnicode_AsUnicodeEscapeString(tmp);
5875 Py_DECREF(tmp);
5876 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877}
5878
5879/* --- Raw Unicode Escape Codec ------------------------------------------- */
5880
Alexander Belopolsky40018472011-02-26 01:02:56 +00005881PyObject *
5882PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005883 Py_ssize_t size,
5884 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005885{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005887 Py_ssize_t startinpos;
5888 Py_ssize_t endinpos;
5889 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005890 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005891 const char *end;
5892 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005893 PyObject *errorHandler = NULL;
5894 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005895
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 /* Escaped strings will always be longer than the resulting
5897 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 length after conversion to the true value. (But decoding error
5899 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005900 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005902 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005903 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005904 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005905 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906 end = s + size;
5907 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005908 unsigned char c;
5909 Py_UCS4 x;
5910 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005911 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 /* Non-escape characters are interpreted as Unicode ordinals */
5914 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005915 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5916 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005918 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 startinpos = s-starts;
5920
5921 /* \u-escapes are only interpreted iff the number of leading
5922 backslashes if odd */
5923 bs = s;
5924 for (;s < end;) {
5925 if (*s != '\\')
5926 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005927 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5928 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 }
5930 if (((s - bs) & 1) == 0 ||
5931 s >= end ||
5932 (*s != 'u' && *s != 'U')) {
5933 continue;
5934 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005935 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 count = *s=='u' ? 4 : 8;
5937 s++;
5938
5939 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 for (x = 0, i = 0; i < count; ++i, ++s) {
5941 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005942 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005943 endinpos = s-starts;
5944 if (unicode_decode_call_errorhandler(
5945 errors, &errorHandler,
5946 "rawunicodeescape", "truncated \\uXXXX",
5947 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005948 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 goto onError;
5950 goto nextByte;
5951 }
5952 x = (x<<4) & ~0xF;
5953 if (c >= '0' && c <= '9')
5954 x += c - '0';
5955 else if (c >= 'a' && c <= 'f')
5956 x += 10 + c - 'a';
5957 else
5958 x += 10 + c - 'A';
5959 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005960 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005961 if (unicode_putchar(&v, &outpos, x) < 0)
5962 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005963 } else {
5964 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005965 if (unicode_decode_call_errorhandler(
5966 errors, &errorHandler,
5967 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005968 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005969 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005970 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005971 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 nextByte:
5973 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005975 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 Py_XDECREF(errorHandler);
5978 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005979 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005980
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005982 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005983 Py_XDECREF(errorHandler);
5984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 return NULL;
5986}
5987
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005988
Alexander Belopolsky40018472011-02-26 01:02:56 +00005989PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005990PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005992 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 char *p;
5994 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005995 Py_ssize_t expandsize, pos;
5996 int kind;
5997 void *data;
5998 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006000 if (!PyUnicode_Check(unicode)) {
6001 PyErr_BadArgument();
6002 return NULL;
6003 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006004 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006005 return NULL;
6006 kind = PyUnicode_KIND(unicode);
6007 data = PyUnicode_DATA(unicode);
6008 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006009 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6010 bytes, and 1 byte characters 4. */
6011 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006012
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006015
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006016 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 if (repr == NULL)
6018 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006019 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006020 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006022 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 for (pos = 0; pos < len; pos++) {
6024 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 /* Map 32-bit characters to '\Uxxxxxxxx' */
6026 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006027 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006028 *p++ = '\\';
6029 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006030 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6031 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6032 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6033 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6034 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6035 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6036 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6037 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006038 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006039 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 *p++ = '\\';
6042 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006043 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6044 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6045 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6046 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 /* Copy everything else as-is */
6049 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 *p++ = (char) ch;
6051 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006052
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006053 assert(p > q);
6054 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006055 return NULL;
6056 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
Alexander Belopolsky40018472011-02-26 01:02:56 +00006059PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006060PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6061 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006063 PyObject *result;
6064 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6065 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006066 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6068 Py_DECREF(tmp);
6069 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070}
6071
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006072/* --- Unicode Internal Codec ------------------------------------------- */
6073
Alexander Belopolsky40018472011-02-26 01:02:56 +00006074PyObject *
6075_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006076 Py_ssize_t size,
6077 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006078{
6079 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006080 Py_ssize_t startinpos;
6081 Py_ssize_t endinpos;
6082 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006083 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006084 const char *end;
6085 const char *reason;
6086 PyObject *errorHandler = NULL;
6087 PyObject *exc = NULL;
6088
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006089 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006090 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006091 1))
6092 return NULL;
6093
Thomas Wouters89f507f2006-12-13 04:49:30 +00006094 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006095 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006096 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006099 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006100 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006101 end = s + size;
6102
6103 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006104 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006105 Py_UCS4 ch;
6106 /* We copy the raw representation one byte at a time because the
6107 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006108 ((char *) &uch)[0] = s[0];
6109 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006110#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006111 ((char *) &uch)[2] = s[2];
6112 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006113#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006114 ch = uch;
6115
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006116 /* We have to sanity check the raw data, otherwise doom looms for
6117 some malformed UCS-4 data. */
6118 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006119#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006120 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006121#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006122 end-s < Py_UNICODE_SIZE
6123 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006125 startinpos = s - starts;
6126 if (end-s < Py_UNICODE_SIZE) {
6127 endinpos = end-starts;
6128 reason = "truncated input";
6129 }
6130 else {
6131 endinpos = s - starts + Py_UNICODE_SIZE;
6132 reason = "illegal code point (> 0x10FFFF)";
6133 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006134 if (unicode_decode_call_errorhandler(
6135 errors, &errorHandler,
6136 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006137 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006138 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006139 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006140 continue;
6141 }
6142
6143 s += Py_UNICODE_SIZE;
6144#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006145 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006146 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006147 Py_UNICODE uch2;
6148 ((char *) &uch2)[0] = s[0];
6149 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006150 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006151 {
Victor Stinner551ac952011-11-29 22:58:13 +01006152 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006153 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006154 }
6155 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006156#endif
6157
6158 if (unicode_putchar(&v, &outpos, ch) < 0)
6159 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006160 }
6161
Victor Stinner16e6a802011-12-12 13:24:15 +01006162 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006163 goto onError;
6164 Py_XDECREF(errorHandler);
6165 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006166 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006169 Py_XDECREF(v);
6170 Py_XDECREF(errorHandler);
6171 Py_XDECREF(exc);
6172 return NULL;
6173}
6174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175/* --- Latin-1 Codec ------------------------------------------------------ */
6176
Alexander Belopolsky40018472011-02-26 01:02:56 +00006177PyObject *
6178PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006179 Py_ssize_t size,
6180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006183 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187static void
6188make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006189 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006190 PyObject *unicode,
6191 Py_ssize_t startpos, Py_ssize_t endpos,
6192 const char *reason)
6193{
6194 if (*exceptionObject == NULL) {
6195 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006196 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006197 encoding, unicode, startpos, endpos, reason);
6198 }
6199 else {
6200 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6201 goto onError;
6202 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6203 goto onError;
6204 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6205 goto onError;
6206 return;
6207 onError:
6208 Py_DECREF(*exceptionObject);
6209 *exceptionObject = NULL;
6210 }
6211}
6212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214static void
6215raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006216 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006217 PyObject *unicode,
6218 Py_ssize_t startpos, Py_ssize_t endpos,
6219 const char *reason)
6220{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006221 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006222 encoding, unicode, startpos, endpos, reason);
6223 if (*exceptionObject != NULL)
6224 PyCodec_StrictErrors(*exceptionObject);
6225}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226
6227/* error handling callback helper:
6228 build arguments, call the callback and check the arguments,
6229 put the result into newpos and return the replacement string, which
6230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006231static PyObject *
6232unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006233 PyObject **errorHandler,
6234 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006236 Py_ssize_t startpos, Py_ssize_t endpos,
6237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006239 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006240 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 PyObject *restuple;
6242 PyObject *resunicode;
6243
6244 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 }
6249
Benjamin Petersonbac79492012-01-14 13:34:47 -05006250 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 return NULL;
6252 len = PyUnicode_GET_LENGTH(unicode);
6253
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006254 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006255 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258
6259 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006264 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 Py_DECREF(restuple);
6266 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006267 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006268 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 &resunicode, newpos)) {
6270 Py_DECREF(restuple);
6271 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006273 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6274 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6275 Py_DECREF(restuple);
6276 return NULL;
6277 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 *newpos = len + *newpos;
6280 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6282 Py_DECREF(restuple);
6283 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006284 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 Py_INCREF(resunicode);
6286 Py_DECREF(restuple);
6287 return resunicode;
6288}
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006293 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 /* input state */
6296 Py_ssize_t pos=0, size;
6297 int kind;
6298 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 /* output object */
6300 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 /* pointer into the output */
6302 char *str;
6303 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006305 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6306 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 PyObject *errorHandler = NULL;
6308 PyObject *exc = NULL;
6309 /* the following variable is used for caching string comparisons
6310 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6311 int known_errorHandler = -1;
6312
Benjamin Petersonbac79492012-01-14 13:34:47 -05006313 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006314 return NULL;
6315 size = PyUnicode_GET_LENGTH(unicode);
6316 kind = PyUnicode_KIND(unicode);
6317 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 /* allocate enough for a simple encoding without
6319 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006320 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006321 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006322 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006324 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006325 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326 ressize = size;
6327
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 while (pos < size) {
6329 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 /* can we encode this? */
6332 if (c<limit) {
6333 /* no overflow check, because we know that the space is enough */
6334 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 Py_ssize_t requiredsize;
6339 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 Py_ssize_t collstart = pos;
6343 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006345 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 ++collend;
6347 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6348 if (known_errorHandler==-1) {
6349 if ((errors==NULL) || (!strcmp(errors, "strict")))
6350 known_errorHandler = 1;
6351 else if (!strcmp(errors, "replace"))
6352 known_errorHandler = 2;
6353 else if (!strcmp(errors, "ignore"))
6354 known_errorHandler = 3;
6355 else if (!strcmp(errors, "xmlcharrefreplace"))
6356 known_errorHandler = 4;
6357 else
6358 known_errorHandler = 0;
6359 }
6360 switch (known_errorHandler) {
6361 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006362 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 goto onError;
6364 case 2: /* replace */
6365 while (collstart++<collend)
6366 *str++ = '?'; /* fall through */
6367 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 break;
6370 case 4: /* xmlcharrefreplace */
6371 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 /* determine replacement size */
6373 for (i = collstart, repsize = 0; i < collend; ++i) {
6374 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6375 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006381 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006385 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006387 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006388 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 if (requiredsize > ressize) {
6394 if (requiredsize<2*ressize)
6395 requiredsize = 2*ressize;
6396 if (_PyBytes_Resize(&res, requiredsize))
6397 goto onError;
6398 str = PyBytes_AS_STRING(res) + respos;
6399 ressize = requiredsize;
6400 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 /* generate replacement */
6402 for (i = collstart; i < collend; ++i) {
6403 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 break;
6407 default:
6408 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 encoding, reason, unicode, &exc,
6410 collstart, collend, &newpos);
6411 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006412 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006414 if (PyBytes_Check(repunicode)) {
6415 /* Directly copy bytes result to output. */
6416 repsize = PyBytes_Size(repunicode);
6417 if (repsize > 1) {
6418 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006419 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006420 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6421 Py_DECREF(repunicode);
6422 goto onError;
6423 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006424 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006425 ressize += repsize-1;
6426 }
6427 memcpy(str, PyBytes_AsString(repunicode), repsize);
6428 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006429 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006430 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006431 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 /* need more space? (at least enough for what we
6434 have+the replacement+the rest of the string, so
6435 we won't have to check space for encodable characters) */
6436 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 repsize = PyUnicode_GET_LENGTH(repunicode);
6438 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 if (requiredsize > ressize) {
6440 if (requiredsize<2*ressize)
6441 requiredsize = 2*ressize;
6442 if (_PyBytes_Resize(&res, requiredsize)) {
6443 Py_DECREF(repunicode);
6444 goto onError;
6445 }
6446 str = PyBytes_AS_STRING(res) + respos;
6447 ressize = requiredsize;
6448 }
6449 /* check if there is anything unencodable in the replacement
6450 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 for (i = 0; repsize-->0; ++i, ++str) {
6452 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006454 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006455 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 Py_DECREF(repunicode);
6457 goto onError;
6458 }
6459 *str = (char)c;
6460 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006462 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006464 }
6465 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006466 /* Resize if we allocated to much */
6467 size = str - PyBytes_AS_STRING(res);
6468 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006469 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006470 if (_PyBytes_Resize(&res, size) < 0)
6471 goto onError;
6472 }
6473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 Py_XDECREF(errorHandler);
6475 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006476 return res;
6477
6478 onError:
6479 Py_XDECREF(res);
6480 Py_XDECREF(errorHandler);
6481 Py_XDECREF(exc);
6482 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483}
6484
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
6487PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006488 Py_ssize_t size,
6489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 PyObject *result;
6492 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6493 if (unicode == NULL)
6494 return NULL;
6495 result = unicode_encode_ucs1(unicode, errors, 256);
6496 Py_DECREF(unicode);
6497 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Alexander Belopolsky40018472011-02-26 01:02:56 +00006500PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006501_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
6503 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 PyErr_BadArgument();
6505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006507 if (PyUnicode_READY(unicode) == -1)
6508 return NULL;
6509 /* Fast path: if it is a one-byte string, construct
6510 bytes object directly. */
6511 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6512 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6513 PyUnicode_GET_LENGTH(unicode));
6514 /* Non-Latin-1 characters present. Defer to above function to
6515 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517}
6518
6519PyObject*
6520PyUnicode_AsLatin1String(PyObject *unicode)
6521{
6522 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
6525/* --- 7-bit ASCII Codec -------------------------------------------------- */
6526
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527PyObject *
6528PyUnicode_DecodeASCII(const char *s,
6529 Py_ssize_t size,
6530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006533 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006534 int kind;
6535 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006536 Py_ssize_t startinpos;
6537 Py_ssize_t endinpos;
6538 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 const char *e;
6540 PyObject *errorHandler = NULL;
6541 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006542
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006544 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006547 if (size == 1 && (unsigned char)s[0] < 128)
6548 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006549
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006550 unicode = PyUnicode_New(size, 127);
6551 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006555 data = PyUnicode_1BYTE_DATA(unicode);
6556 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6557 if (outpos == size)
6558 return unicode;
6559
6560 s += outpos;
6561 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 register unsigned char c = (unsigned char)*s;
6564 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006565 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 ++s;
6567 }
6568 else {
6569 startinpos = s-starts;
6570 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 if (unicode_decode_call_errorhandler(
6572 errors, &errorHandler,
6573 "ascii", "ordinal not in range(128)",
6574 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006575 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006577 kind = PyUnicode_KIND(unicode);
6578 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006581 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006582 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583 Py_XDECREF(errorHandler);
6584 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006585 assert(_PyUnicode_CheckConsistency(unicode, 1));
6586 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006587
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006589 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 Py_XDECREF(errorHandler);
6591 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return NULL;
6593}
6594
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596PyObject *
6597PyUnicode_EncodeASCII(const Py_UNICODE *p,
6598 Py_ssize_t size,
6599 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 PyObject *result;
6602 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6603 if (unicode == NULL)
6604 return NULL;
6605 result = unicode_encode_ucs1(unicode, errors, 128);
6606 Py_DECREF(unicode);
6607 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006611_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
6613 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 PyErr_BadArgument();
6615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006617 if (PyUnicode_READY(unicode) == -1)
6618 return NULL;
6619 /* Fast path: if it is an ASCII-only string, construct bytes object
6620 directly. Else defer to above function to raise the exception. */
6621 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6622 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6623 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006625}
6626
6627PyObject *
6628PyUnicode_AsASCIIString(PyObject *unicode)
6629{
6630 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Victor Stinner99b95382011-07-04 14:23:54 +02006633#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006634
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006635/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006636
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006637#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006638#define NEED_RETRY
6639#endif
6640
Victor Stinner3a50e702011-10-18 21:21:00 +02006641#ifndef WC_ERR_INVALID_CHARS
6642# define WC_ERR_INVALID_CHARS 0x0080
6643#endif
6644
6645static char*
6646code_page_name(UINT code_page, PyObject **obj)
6647{
6648 *obj = NULL;
6649 if (code_page == CP_ACP)
6650 return "mbcs";
6651 if (code_page == CP_UTF7)
6652 return "CP_UTF7";
6653 if (code_page == CP_UTF8)
6654 return "CP_UTF8";
6655
6656 *obj = PyBytes_FromFormat("cp%u", code_page);
6657 if (*obj == NULL)
6658 return NULL;
6659 return PyBytes_AS_STRING(*obj);
6660}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006663is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006664{
6665 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006666 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667
Victor Stinner3a50e702011-10-18 21:21:00 +02006668 if (!IsDBCSLeadByteEx(code_page, *curr))
6669 return 0;
6670
6671 prev = CharPrevExA(code_page, s, curr, 0);
6672 if (prev == curr)
6673 return 1;
6674 /* FIXME: This code is limited to "true" double-byte encodings,
6675 as it assumes an incomplete character consists of a single
6676 byte. */
6677 if (curr - prev == 2)
6678 return 1;
6679 if (!IsDBCSLeadByteEx(code_page, *prev))
6680 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006681 return 0;
6682}
6683
Victor Stinner3a50e702011-10-18 21:21:00 +02006684static DWORD
6685decode_code_page_flags(UINT code_page)
6686{
6687 if (code_page == CP_UTF7) {
6688 /* The CP_UTF7 decoder only supports flags=0 */
6689 return 0;
6690 }
6691 else
6692 return MB_ERR_INVALID_CHARS;
6693}
6694
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006695/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 * Decode a byte string from a Windows code page into unicode object in strict
6697 * mode.
6698 *
6699 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6700 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006702static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006703decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006704 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006705 const char *in,
6706 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707{
Victor Stinner3a50e702011-10-18 21:21:00 +02006708 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006709 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006710 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711
6712 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006713 assert(insize > 0);
6714 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6715 if (outsize <= 0)
6716 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717
6718 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006720 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006721 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 if (*v == NULL)
6723 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006724 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725 }
6726 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006728 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006729 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 }
6733
6734 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006735 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6736 if (outsize <= 0)
6737 goto error;
6738 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006739
Victor Stinner3a50e702011-10-18 21:21:00 +02006740error:
6741 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6742 return -2;
6743 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006744 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745}
6746
Victor Stinner3a50e702011-10-18 21:21:00 +02006747/*
6748 * Decode a byte string from a code page into unicode object with an error
6749 * handler.
6750 *
6751 * Returns consumed size if succeed, or raise a WindowsError or
6752 * UnicodeDecodeError exception and returns -1 on error.
6753 */
6754static int
6755decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 PyObject **v,
6757 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006758 const char *errors)
6759{
6760 const char *startin = in;
6761 const char *endin = in + size;
6762 const DWORD flags = decode_code_page_flags(code_page);
6763 /* Ideally, we should get reason from FormatMessage. This is the Windows
6764 2000 English version of the message. */
6765 const char *reason = "No mapping for the Unicode character exists "
6766 "in the target code page.";
6767 /* each step cannot decode more than 1 character, but a character can be
6768 represented as a surrogate pair */
6769 wchar_t buffer[2], *startout, *out;
6770 int insize, outsize;
6771 PyObject *errorHandler = NULL;
6772 PyObject *exc = NULL;
6773 PyObject *encoding_obj = NULL;
6774 char *encoding;
6775 DWORD err;
6776 int ret = -1;
6777
6778 assert(size > 0);
6779
6780 encoding = code_page_name(code_page, &encoding_obj);
6781 if (encoding == NULL)
6782 return -1;
6783
6784 if (errors == NULL || strcmp(errors, "strict") == 0) {
6785 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6786 UnicodeDecodeError. */
6787 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6788 if (exc != NULL) {
6789 PyCodec_StrictErrors(exc);
6790 Py_CLEAR(exc);
6791 }
6792 goto error;
6793 }
6794
6795 if (*v == NULL) {
6796 /* Create unicode object */
6797 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6798 PyErr_NoMemory();
6799 goto error;
6800 }
Victor Stinnerab595942011-12-17 04:59:06 +01006801 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006802 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006803 if (*v == NULL)
6804 goto error;
6805 startout = PyUnicode_AS_UNICODE(*v);
6806 }
6807 else {
6808 /* Extend unicode object */
6809 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6810 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6811 PyErr_NoMemory();
6812 goto error;
6813 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006814 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 goto error;
6816 startout = PyUnicode_AS_UNICODE(*v) + n;
6817 }
6818
6819 /* Decode the byte string character per character */
6820 out = startout;
6821 while (in < endin)
6822 {
6823 /* Decode a character */
6824 insize = 1;
6825 do
6826 {
6827 outsize = MultiByteToWideChar(code_page, flags,
6828 in, insize,
6829 buffer, Py_ARRAY_LENGTH(buffer));
6830 if (outsize > 0)
6831 break;
6832 err = GetLastError();
6833 if (err != ERROR_NO_UNICODE_TRANSLATION
6834 && err != ERROR_INSUFFICIENT_BUFFER)
6835 {
6836 PyErr_SetFromWindowsErr(0);
6837 goto error;
6838 }
6839 insize++;
6840 }
6841 /* 4=maximum length of a UTF-8 sequence */
6842 while (insize <= 4 && (in + insize) <= endin);
6843
6844 if (outsize <= 0) {
6845 Py_ssize_t startinpos, endinpos, outpos;
6846
6847 startinpos = in - startin;
6848 endinpos = startinpos + 1;
6849 outpos = out - PyUnicode_AS_UNICODE(*v);
6850 if (unicode_decode_call_errorhandler(
6851 errors, &errorHandler,
6852 encoding, reason,
6853 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006854 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 {
6856 goto error;
6857 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006858 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 }
6860 else {
6861 in += insize;
6862 memcpy(out, buffer, outsize * sizeof(wchar_t));
6863 out += outsize;
6864 }
6865 }
6866
6867 /* write a NUL character at the end */
6868 *out = 0;
6869
6870 /* Extend unicode object */
6871 outsize = out - startout;
6872 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006873 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006875 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006876
6877error:
6878 Py_XDECREF(encoding_obj);
6879 Py_XDECREF(errorHandler);
6880 Py_XDECREF(exc);
6881 return ret;
6882}
6883
Victor Stinner3a50e702011-10-18 21:21:00 +02006884static PyObject *
6885decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006886 const char *s, Py_ssize_t size,
6887 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888{
Victor Stinner76a31a62011-11-04 00:05:13 +01006889 PyObject *v = NULL;
6890 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 if (code_page < 0) {
6893 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6894 return NULL;
6895 }
6896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006899
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 do
6901 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006903 if (size > INT_MAX) {
6904 chunk_size = INT_MAX;
6905 final = 0;
6906 done = 0;
6907 }
6908 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006910 {
6911 chunk_size = (int)size;
6912 final = (consumed == NULL);
6913 done = 1;
6914 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 /* Skip trailing lead-byte unless 'final' is set */
6917 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6918 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 if (chunk_size == 0 && done) {
6921 if (v != NULL)
6922 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925
Victor Stinner76a31a62011-11-04 00:05:13 +01006926
6927 converted = decode_code_page_strict(code_page, &v,
6928 s, chunk_size);
6929 if (converted == -2)
6930 converted = decode_code_page_errors(code_page, &v,
6931 s, chunk_size,
6932 errors);
6933 assert(converted != 0);
6934
6935 if (converted < 0) {
6936 Py_XDECREF(v);
6937 return NULL;
6938 }
6939
6940 if (consumed)
6941 *consumed += converted;
6942
6943 s += converted;
6944 size -= converted;
6945 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006946
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006947 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948}
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006951PyUnicode_DecodeCodePageStateful(int code_page,
6952 const char *s,
6953 Py_ssize_t size,
6954 const char *errors,
6955 Py_ssize_t *consumed)
6956{
6957 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6958}
6959
6960PyObject *
6961PyUnicode_DecodeMBCSStateful(const char *s,
6962 Py_ssize_t size,
6963 const char *errors,
6964 Py_ssize_t *consumed)
6965{
6966 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6967}
6968
6969PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006970PyUnicode_DecodeMBCS(const char *s,
6971 Py_ssize_t size,
6972 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006973{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6975}
6976
Victor Stinner3a50e702011-10-18 21:21:00 +02006977static DWORD
6978encode_code_page_flags(UINT code_page, const char *errors)
6979{
6980 if (code_page == CP_UTF8) {
6981 if (winver.dwMajorVersion >= 6)
6982 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6983 and later */
6984 return WC_ERR_INVALID_CHARS;
6985 else
6986 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6987 return 0;
6988 }
6989 else if (code_page == CP_UTF7) {
6990 /* CP_UTF7 only supports flags=0 */
6991 return 0;
6992 }
6993 else {
6994 if (errors != NULL && strcmp(errors, "replace") == 0)
6995 return 0;
6996 else
6997 return WC_NO_BEST_FIT_CHARS;
6998 }
6999}
7000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 * Encode a Unicode string to a Windows code page into a byte string in strict
7003 * mode.
7004 *
7005 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7006 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007009encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007010 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007012{
Victor Stinner554f3f02010-06-16 23:33:54 +00007013 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 BOOL *pusedDefaultChar = &usedDefaultChar;
7015 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007016 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007017 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007018 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 const DWORD flags = encode_code_page_flags(code_page, NULL);
7020 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007021 /* Create a substring so that we can get the UTF-16 representation
7022 of just the slice under consideration. */
7023 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
Martin v. Löwis3d325192011-11-04 18:23:06 +01007025 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007026
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007028 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007030 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007031
Victor Stinner2fc507f2011-11-04 20:06:39 +01007032 substring = PyUnicode_Substring(unicode, offset, offset+len);
7033 if (substring == NULL)
7034 return -1;
7035 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7036 if (p == NULL) {
7037 Py_DECREF(substring);
7038 return -1;
7039 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007040
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007041 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 outsize = WideCharToMultiByte(code_page, flags,
7043 p, size,
7044 NULL, 0,
7045 NULL, pusedDefaultChar);
7046 if (outsize <= 0)
7047 goto error;
7048 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007049 if (pusedDefaultChar && *pusedDefaultChar) {
7050 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007057 if (*outbytes == NULL) {
7058 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007060 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062 }
7063 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const Py_ssize_t n = PyBytes_Size(*outbytes);
7066 if (outsize > PY_SSIZE_T_MAX - n) {
7067 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007068 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007071 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7072 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 }
7077
7078 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 outsize = WideCharToMultiByte(code_page, flags,
7080 p, size,
7081 out, outsize,
7082 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007083 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 if (outsize <= 0)
7085 goto error;
7086 if (pusedDefaultChar && *pusedDefaultChar)
7087 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007091 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7093 return -2;
7094 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007095 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007096}
7097
Victor Stinner3a50e702011-10-18 21:21:00 +02007098/*
7099 * Encode a Unicode string to a Windows code page into a byte string using a
7100 * error handler.
7101 *
7102 * Returns consumed characters if succeed, or raise a WindowsError and returns
7103 * -1 on other error.
7104 */
7105static int
7106encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007107 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007108 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007111 Py_ssize_t pos = unicode_offset;
7112 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 /* Ideally, we should get reason from FormatMessage. This is the Windows
7114 2000 English version of the message. */
7115 const char *reason = "invalid character";
7116 /* 4=maximum length of a UTF-8 sequence */
7117 char buffer[4];
7118 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7119 Py_ssize_t outsize;
7120 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 PyObject *errorHandler = NULL;
7122 PyObject *exc = NULL;
7123 PyObject *encoding_obj = NULL;
7124 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007125 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 PyObject *rep;
7127 int ret = -1;
7128
7129 assert(insize > 0);
7130
7131 encoding = code_page_name(code_page, &encoding_obj);
7132 if (encoding == NULL)
7133 return -1;
7134
7135 if (errors == NULL || strcmp(errors, "strict") == 0) {
7136 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7137 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007138 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 if (exc != NULL) {
7140 PyCodec_StrictErrors(exc);
7141 Py_DECREF(exc);
7142 }
7143 Py_XDECREF(encoding_obj);
7144 return -1;
7145 }
7146
7147 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7148 pusedDefaultChar = &usedDefaultChar;
7149 else
7150 pusedDefaultChar = NULL;
7151
7152 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7153 PyErr_NoMemory();
7154 goto error;
7155 }
7156 outsize = insize * Py_ARRAY_LENGTH(buffer);
7157
7158 if (*outbytes == NULL) {
7159 /* Create string object */
7160 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7161 if (*outbytes == NULL)
7162 goto error;
7163 out = PyBytes_AS_STRING(*outbytes);
7164 }
7165 else {
7166 /* Extend string object */
7167 Py_ssize_t n = PyBytes_Size(*outbytes);
7168 if (n > PY_SSIZE_T_MAX - outsize) {
7169 PyErr_NoMemory();
7170 goto error;
7171 }
7172 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7173 goto error;
7174 out = PyBytes_AS_STRING(*outbytes) + n;
7175 }
7176
7177 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7181 wchar_t chars[2];
7182 int charsize;
7183 if (ch < 0x10000) {
7184 chars[0] = (wchar_t)ch;
7185 charsize = 1;
7186 }
7187 else {
7188 ch -= 0x10000;
7189 chars[0] = 0xd800 + (ch >> 10);
7190 chars[1] = 0xdc00 + (ch & 0x3ff);
7191 charsize = 2;
7192 }
7193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 buffer, Py_ARRAY_LENGTH(buffer),
7197 NULL, pusedDefaultChar);
7198 if (outsize > 0) {
7199 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7200 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007201 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 memcpy(out, buffer, outsize);
7203 out += outsize;
7204 continue;
7205 }
7206 }
7207 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7208 PyErr_SetFromWindowsErr(0);
7209 goto error;
7210 }
7211
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 rep = unicode_encode_call_errorhandler(
7213 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007214 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 if (rep == NULL)
7217 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219
7220 if (PyBytes_Check(rep)) {
7221 outsize = PyBytes_GET_SIZE(rep);
7222 if (outsize != 1) {
7223 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7224 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7225 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7226 Py_DECREF(rep);
7227 goto error;
7228 }
7229 out = PyBytes_AS_STRING(*outbytes) + offset;
7230 }
7231 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7232 out += outsize;
7233 }
7234 else {
7235 Py_ssize_t i;
7236 enum PyUnicode_Kind kind;
7237 void *data;
7238
Benjamin Petersonbac79492012-01-14 13:34:47 -05007239 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 Py_DECREF(rep);
7241 goto error;
7242 }
7243
7244 outsize = PyUnicode_GET_LENGTH(rep);
7245 if (outsize != 1) {
7246 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7247 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7248 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7249 Py_DECREF(rep);
7250 goto error;
7251 }
7252 out = PyBytes_AS_STRING(*outbytes) + offset;
7253 }
7254 kind = PyUnicode_KIND(rep);
7255 data = PyUnicode_DATA(rep);
7256 for (i=0; i < outsize; i++) {
7257 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7258 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007259 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007260 encoding, unicode,
7261 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 "unable to encode error handler result to ASCII");
7263 Py_DECREF(rep);
7264 goto error;
7265 }
7266 *out = (unsigned char)ch;
7267 out++;
7268 }
7269 }
7270 Py_DECREF(rep);
7271 }
7272 /* write a NUL byte */
7273 *out = 0;
7274 outsize = out - PyBytes_AS_STRING(*outbytes);
7275 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7276 if (_PyBytes_Resize(outbytes, outsize) < 0)
7277 goto error;
7278 ret = 0;
7279
7280error:
7281 Py_XDECREF(encoding_obj);
7282 Py_XDECREF(errorHandler);
7283 Py_XDECREF(exc);
7284 return ret;
7285}
7286
Victor Stinner3a50e702011-10-18 21:21:00 +02007287static PyObject *
7288encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007289 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 const char *errors)
7291{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007292 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007294 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007296
Benjamin Petersonbac79492012-01-14 13:34:47 -05007297 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007298 return NULL;
7299 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 if (code_page < 0) {
7302 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7303 return NULL;
7304 }
7305
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007307 return PyBytes_FromStringAndSize(NULL, 0);
7308
Victor Stinner7581cef2011-11-03 22:32:33 +01007309 offset = 0;
7310 do
7311 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007312#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007314 chunks. */
7315 if (len > INT_MAX/2) {
7316 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 done = 0;
7318 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007319 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007323 done = 1;
7324 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007325
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 errors);
7329 if (ret == -2)
7330 ret = encode_code_page_errors(code_page, &outbytes,
7331 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007333 if (ret < 0) {
7334 Py_XDECREF(outbytes);
7335 return NULL;
7336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner7581cef2011-11-03 22:32:33 +01007338 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 return outbytes;
7343}
7344
7345PyObject *
7346PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7347 Py_ssize_t size,
7348 const char *errors)
7349{
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 PyObject *unicode, *res;
7351 unicode = PyUnicode_FromUnicode(p, size);
7352 if (unicode == NULL)
7353 return NULL;
7354 res = encode_code_page(CP_ACP, unicode, errors);
7355 Py_DECREF(unicode);
7356 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357}
7358
7359PyObject *
7360PyUnicode_EncodeCodePage(int code_page,
7361 PyObject *unicode,
7362 const char *errors)
7363{
Victor Stinner7581cef2011-11-03 22:32:33 +01007364 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007365}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyObject *
7368PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007369{
7370 if (!PyUnicode_Check(unicode)) {
7371 PyErr_BadArgument();
7372 return NULL;
7373 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007374 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007375}
7376
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377#undef NEED_RETRY
7378
Victor Stinner99b95382011-07-04 14:23:54 +02007379#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007380
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381/* --- Character Mapping Codec -------------------------------------------- */
7382
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383PyObject *
7384PyUnicode_DecodeCharmap(const char *s,
7385 Py_ssize_t size,
7386 PyObject *mapping,
7387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390 Py_ssize_t startinpos;
7391 Py_ssize_t endinpos;
7392 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007394 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 PyObject *errorHandler = NULL;
7397 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007398
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 /* Default to Latin-1 */
7400 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007403 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007407 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007408 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007409 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007410 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007411 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007412 enum PyUnicode_Kind mapkind;
7413 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007414 Py_UCS4 x;
7415
Benjamin Petersonbac79492012-01-14 13:34:47 -05007416 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007417 return NULL;
7418
7419 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007420 mapdata = PyUnicode_DATA(mapping);
7421 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007423 unsigned char ch;
7424 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7425 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7426 if (outkind == PyUnicode_1BYTE_KIND) {
7427 void *outdata = PyUnicode_DATA(v);
7428 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7429 while (s < e) {
7430 unsigned char ch = *s;
7431 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7432 if (x > maxchar)
7433 goto Error;
7434 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7435 ++s;
7436 }
7437 break;
7438 }
7439 else if (outkind == PyUnicode_2BYTE_KIND) {
7440 void *outdata = PyUnicode_DATA(v);
7441 while (s < e) {
7442 unsigned char ch = *s;
7443 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7444 if (x == 0xFFFE)
7445 goto Error;
7446 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7447 ++s;
7448 }
7449 break;
7450 }
7451 }
7452 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007455 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007456 else
7457 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007458Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007459 if (x == 0xfffe)
7460 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 startinpos = s-starts;
7463 endinpos = startinpos+1;
7464 if (unicode_decode_call_errorhandler(
7465 errors, &errorHandler,
7466 "charmap", "character maps to <undefined>",
7467 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007468 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 goto onError;
7470 }
7471 continue;
7472 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007473
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007474 if (unicode_putchar(&v, &outpos, x) < 0)
7475 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007478 }
7479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 while (s < e) {
7481 unsigned char ch = *s;
7482 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007483
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7485 w = PyLong_FromLong((long)ch);
7486 if (w == NULL)
7487 goto onError;
7488 x = PyObject_GetItem(mapping, w);
7489 Py_DECREF(w);
7490 if (x == NULL) {
7491 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7492 /* No mapping found means: mapping is undefined. */
7493 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007494 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 } else
7496 goto onError;
7497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007500 if (x == Py_None)
7501 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (PyLong_Check(x)) {
7503 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007504 if (value == 0xFFFE)
7505 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007506 if (value < 0 || value > MAX_UNICODE) {
7507 PyErr_Format(PyExc_TypeError,
7508 "character mapping must be in range(0x%lx)",
7509 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 Py_DECREF(x);
7511 goto onError;
7512 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007513 if (unicode_putchar(&v, &outpos, value) < 0) {
7514 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007515 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007519 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007520
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007521 if (PyUnicode_READY(x) == -1) {
7522 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007523 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007524 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007525 targetsize = PyUnicode_GET_LENGTH(x);
7526
7527 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007529 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007530 if (value == 0xFFFE)
7531 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007532 if (unicode_putchar(&v, &outpos, value) < 0) {
7533 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007534 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007535 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 else if (targetsize > 1) {
7538 /* 1-n mapping */
7539 if (targetsize > extrachars) {
7540 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 Py_ssize_t needed = (targetsize - extrachars) + \
7542 (targetsize << 2);
7543 extrachars += needed;
7544 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007545 if (unicode_resize(&v,
7546 PyUnicode_GET_LENGTH(v) + needed) < 0)
7547 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 Py_DECREF(x);
7549 goto onError;
7550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007552 if (unicode_widen(&v, outpos,
7553 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7554 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007555 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007556 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007557 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7558 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 extrachars -= targetsize;
7560 }
7561 /* 1-0 mapping: skip the character */
7562 }
7563 else {
7564 /* wrong return value */
7565 PyErr_SetString(PyExc_TypeError,
7566 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007567 Py_DECREF(x);
7568 goto onError;
7569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_DECREF(x);
7571 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007572 continue;
7573Undefined:
7574 /* undefined mapping */
7575 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007576 startinpos = s-starts;
7577 endinpos = startinpos+1;
7578 if (unicode_decode_call_errorhandler(
7579 errors, &errorHandler,
7580 "charmap", "character maps to <undefined>",
7581 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007582 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007583 goto onError;
7584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007587 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007588 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589 Py_XDECREF(errorHandler);
7590 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007591 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007592
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 Py_XDECREF(errorHandler);
7595 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 Py_XDECREF(v);
7597 return NULL;
7598}
7599
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007600/* Charmap encoding: the lookup table */
7601
Alexander Belopolsky40018472011-02-26 01:02:56 +00007602struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 PyObject_HEAD
7604 unsigned char level1[32];
7605 int count2, count3;
7606 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007607};
7608
7609static PyObject*
7610encoding_map_size(PyObject *obj, PyObject* args)
7611{
7612 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007613 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007615}
7616
7617static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 PyDoc_STR("Return the size (in bytes) of this object") },
7620 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621};
7622
7623static void
7624encoding_map_dealloc(PyObject* o)
7625{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627}
7628
7629static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 "EncodingMap", /*tp_name*/
7632 sizeof(struct encoding_map), /*tp_basicsize*/
7633 0, /*tp_itemsize*/
7634 /* methods */
7635 encoding_map_dealloc, /*tp_dealloc*/
7636 0, /*tp_print*/
7637 0, /*tp_getattr*/
7638 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007639 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 0, /*tp_repr*/
7641 0, /*tp_as_number*/
7642 0, /*tp_as_sequence*/
7643 0, /*tp_as_mapping*/
7644 0, /*tp_hash*/
7645 0, /*tp_call*/
7646 0, /*tp_str*/
7647 0, /*tp_getattro*/
7648 0, /*tp_setattro*/
7649 0, /*tp_as_buffer*/
7650 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7651 0, /*tp_doc*/
7652 0, /*tp_traverse*/
7653 0, /*tp_clear*/
7654 0, /*tp_richcompare*/
7655 0, /*tp_weaklistoffset*/
7656 0, /*tp_iter*/
7657 0, /*tp_iternext*/
7658 encoding_map_methods, /*tp_methods*/
7659 0, /*tp_members*/
7660 0, /*tp_getset*/
7661 0, /*tp_base*/
7662 0, /*tp_dict*/
7663 0, /*tp_descr_get*/
7664 0, /*tp_descr_set*/
7665 0, /*tp_dictoffset*/
7666 0, /*tp_init*/
7667 0, /*tp_alloc*/
7668 0, /*tp_new*/
7669 0, /*tp_free*/
7670 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007671};
7672
7673PyObject*
7674PyUnicode_BuildEncodingMap(PyObject* string)
7675{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 PyObject *result;
7677 struct encoding_map *mresult;
7678 int i;
7679 int need_dict = 0;
7680 unsigned char level1[32];
7681 unsigned char level2[512];
7682 unsigned char *mlevel1, *mlevel2, *mlevel3;
7683 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684 int kind;
7685 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007686 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007689 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690 PyErr_BadArgument();
7691 return NULL;
7692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 kind = PyUnicode_KIND(string);
7694 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007695 length = PyUnicode_GET_LENGTH(string);
7696 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007697 memset(level1, 0xFF, sizeof level1);
7698 memset(level2, 0xFF, sizeof level2);
7699
7700 /* If there isn't a one-to-one mapping of NULL to \0,
7701 or if there are non-BMP characters, we need to use
7702 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007705 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 ch = PyUnicode_READ(kind, data, i);
7708 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007709 need_dict = 1;
7710 break;
7711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007712 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 /* unmapped character */
7714 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 l1 = ch >> 11;
7716 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 if (level1[l1] == 0xFF)
7718 level1[l1] = count2++;
7719 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721 }
7722
7723 if (count2 >= 0xFF || count3 >= 0xFF)
7724 need_dict = 1;
7725
7726 if (need_dict) {
7727 PyObject *result = PyDict_New();
7728 PyObject *key, *value;
7729 if (!result)
7730 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007731 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007733 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007734 if (!key || !value)
7735 goto failed1;
7736 if (PyDict_SetItem(result, key, value) == -1)
7737 goto failed1;
7738 Py_DECREF(key);
7739 Py_DECREF(value);
7740 }
7741 return result;
7742 failed1:
7743 Py_XDECREF(key);
7744 Py_XDECREF(value);
7745 Py_DECREF(result);
7746 return NULL;
7747 }
7748
7749 /* Create a three-level trie */
7750 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7751 16*count2 + 128*count3 - 1);
7752 if (!result)
7753 return PyErr_NoMemory();
7754 PyObject_Init(result, &EncodingMapType);
7755 mresult = (struct encoding_map*)result;
7756 mresult->count2 = count2;
7757 mresult->count3 = count3;
7758 mlevel1 = mresult->level1;
7759 mlevel2 = mresult->level23;
7760 mlevel3 = mresult->level23 + 16*count2;
7761 memcpy(mlevel1, level1, 32);
7762 memset(mlevel2, 0xFF, 16*count2);
7763 memset(mlevel3, 0, 128*count3);
7764 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007765 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007767 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7768 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 /* unmapped character */
7770 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007771 o1 = ch>>11;
7772 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 i2 = 16*mlevel1[o1] + o2;
7774 if (mlevel2[i2] == 0xFF)
7775 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007776 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 i3 = 128*mlevel2[i2] + o3;
7778 mlevel3[i3] = i;
7779 }
7780 return result;
7781}
7782
7783static int
Victor Stinner22168992011-11-20 17:09:18 +01007784encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785{
7786 struct encoding_map *map = (struct encoding_map*)mapping;
7787 int l1 = c>>11;
7788 int l2 = (c>>7) & 0xF;
7789 int l3 = c & 0x7F;
7790 int i;
7791
Victor Stinner22168992011-11-20 17:09:18 +01007792 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007794 if (c == 0)
7795 return 0;
7796 /* level 1*/
7797 i = map->level1[l1];
7798 if (i == 0xFF) {
7799 return -1;
7800 }
7801 /* level 2*/
7802 i = map->level23[16*i+l2];
7803 if (i == 0xFF) {
7804 return -1;
7805 }
7806 /* level 3 */
7807 i = map->level23[16*map->count2 + 128*i + l3];
7808 if (i == 0) {
7809 return -1;
7810 }
7811 return i;
7812}
7813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814/* Lookup the character ch in the mapping. If the character
7815 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007816 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007818charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Christian Heimes217cfd12007-12-02 14:31:20 +00007820 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 PyObject *x;
7822
7823 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 x = PyObject_GetItem(mapping, w);
7826 Py_DECREF(w);
7827 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7829 /* No mapping found means: mapping is undefined. */
7830 PyErr_Clear();
7831 x = Py_None;
7832 Py_INCREF(x);
7833 return x;
7834 } else
7835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007837 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 long value = PyLong_AS_LONG(x);
7841 if (value < 0 || value > 255) {
7842 PyErr_SetString(PyExc_TypeError,
7843 "character mapping must be in range(256)");
7844 Py_DECREF(x);
7845 return NULL;
7846 }
7847 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007849 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* wrong return value */
7853 PyErr_Format(PyExc_TypeError,
7854 "character mapping must return integer, bytes or None, not %.400s",
7855 x->ob_type->tp_name);
7856 Py_DECREF(x);
7857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
7859}
7860
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007862charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7865 /* exponentially overallocate to minimize reallocations */
7866 if (requiredsize < 2*outsize)
7867 requiredsize = 2*outsize;
7868 if (_PyBytes_Resize(outobj, requiredsize))
7869 return -1;
7870 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871}
7872
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007877 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007878 space is available. Return a new reference to the object that
7879 was put in the output buffer, or Py_None, if the mapping was undefined
7880 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007881 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007883charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886 PyObject *rep;
7887 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889
Christian Heimes90aa7642007-12-19 02:45:37 +00007890 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (res == -1)
7894 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 if (outsize<requiredsize)
7896 if (charmapencode_resize(outobj, outpos, requiredsize))
7897 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007898 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 outstart[(*outpos)++] = (char)res;
7900 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 }
7902
7903 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_DECREF(rep);
7908 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 if (PyLong_Check(rep)) {
7911 Py_ssize_t requiredsize = *outpos+1;
7912 if (outsize<requiredsize)
7913 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7914 Py_DECREF(rep);
7915 return enc_EXCEPTION;
7916 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007917 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 else {
7921 const char *repchars = PyBytes_AS_STRING(rep);
7922 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7923 Py_ssize_t requiredsize = *outpos+repsize;
7924 if (outsize<requiredsize)
7925 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7926 Py_DECREF(rep);
7927 return enc_EXCEPTION;
7928 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007929 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 memcpy(outstart + *outpos, repchars, repsize);
7931 *outpos += repsize;
7932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 Py_DECREF(rep);
7935 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936}
7937
7938/* handle an error in PyUnicode_EncodeCharmap
7939 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007940static int
7941charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007942 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007944 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007945 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946{
7947 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007948 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007950 enum PyUnicode_Kind kind;
7951 void *data;
7952 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t collstartpos = *inpos;
7955 Py_ssize_t collendpos = *inpos+1;
7956 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 char *encoding = "charmap";
7958 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return -1;
7965 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 /* find all unencodable characters */
7967 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007969 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007971 val = encoding_map_lookup(ch, mapping);
7972 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 break;
7974 ++collendpos;
7975 continue;
7976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7979 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (rep==NULL)
7981 return -1;
7982 else if (rep!=Py_None) {
7983 Py_DECREF(rep);
7984 break;
7985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 }
7989 /* cache callback name lookup
7990 * (if not done yet, i.e. it's the first error) */
7991 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 if ((errors==NULL) || (!strcmp(errors, "strict")))
7993 *known_errorHandler = 1;
7994 else if (!strcmp(errors, "replace"))
7995 *known_errorHandler = 2;
7996 else if (!strcmp(errors, "ignore"))
7997 *known_errorHandler = 3;
7998 else if (!strcmp(errors, "xmlcharrefreplace"))
7999 *known_errorHandler = 4;
8000 else
8001 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002 }
8003 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008005 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 return -1;
8007 case 2: /* replace */
8008 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 x = charmapencode_output('?', mapping, res, respos);
8010 if (x==enc_EXCEPTION) {
8011 return -1;
8012 }
8013 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008014 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return -1;
8016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 }
8018 /* fall through */
8019 case 3: /* ignore */
8020 *inpos = collendpos;
8021 break;
8022 case 4: /* xmlcharrefreplace */
8023 /* generate replacement (temporarily (mis)uses p) */
8024 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 char buffer[2+29+1+1];
8026 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008027 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 for (cp = buffer; *cp; ++cp) {
8029 x = charmapencode_output(*cp, mapping, res, respos);
8030 if (x==enc_EXCEPTION)
8031 return -1;
8032 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008033 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 return -1;
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 }
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 *inpos = collendpos;
8039 break;
8040 default:
8041 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008042 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008046 if (PyBytes_Check(repunicode)) {
8047 /* Directly copy bytes result to output. */
8048 Py_ssize_t outsize = PyBytes_Size(*res);
8049 Py_ssize_t requiredsize;
8050 repsize = PyBytes_Size(repunicode);
8051 requiredsize = *respos + repsize;
8052 if (requiredsize > outsize)
8053 /* Make room for all additional bytes. */
8054 if (charmapencode_resize(res, respos, requiredsize)) {
8055 Py_DECREF(repunicode);
8056 return -1;
8057 }
8058 memcpy(PyBytes_AsString(*res) + *respos,
8059 PyBytes_AsString(repunicode), repsize);
8060 *respos += repsize;
8061 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008066 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
8068 return -1;
8069 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008070 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008071 data = PyUnicode_DATA(repunicode);
8072 kind = PyUnicode_KIND(repunicode);
8073 for (index = 0; index < repsize; index++) {
8074 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8075 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 return -1;
8079 }
8080 else if (x==enc_FAILED) {
8081 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008082 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
8086 *inpos = newpos;
8087 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089 return 0;
8090}
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093_PyUnicode_EncodeCharmap(PyObject *unicode,
8094 PyObject *mapping,
8095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* output object */
8098 PyObject *res = NULL;
8099 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 PyObject *errorHandler = NULL;
8105 PyObject *exc = NULL;
8106 /* the following variable is used for caching string comparisons
8107 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8108 * 3=ignore, 4=xmlcharrefreplace */
8109 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 return NULL;
8113 size = PyUnicode_GET_LENGTH(unicode);
8114
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 /* Default to Latin-1 */
8116 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 /* allocate enough for a simple encoding without
8120 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 if (res == NULL)
8123 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008124 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (x==enc_EXCEPTION) /* error */
8132 goto onError;
8133 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 &exc,
8136 &known_errorHandler, &errorHandler, errors,
8137 &res, &respos)) {
8138 goto onError;
8139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 else
8142 /* done with this character => adjust input position */
8143 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008147 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008148 if (_PyBytes_Resize(&res, respos) < 0)
8149 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 Py_XDECREF(exc);
8152 Py_XDECREF(errorHandler);
8153 return res;
8154
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 Py_XDECREF(res);
8157 Py_XDECREF(exc);
8158 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 return NULL;
8160}
8161
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162/* Deprecated */
8163PyObject *
8164PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8165 Py_ssize_t size,
8166 PyObject *mapping,
8167 const char *errors)
8168{
8169 PyObject *result;
8170 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8171 if (unicode == NULL)
8172 return NULL;
8173 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8174 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008175 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008176}
8177
Alexander Belopolsky40018472011-02-26 01:02:56 +00008178PyObject *
8179PyUnicode_AsCharmapString(PyObject *unicode,
8180 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181{
8182 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 PyErr_BadArgument();
8184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008186 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187}
8188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008190static void
8191make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193 Py_ssize_t startpos, Py_ssize_t endpos,
8194 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 *exceptionObject = _PyUnicodeTranslateError_Create(
8198 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 }
8200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8202 goto onError;
8203 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8206 goto onError;
8207 return;
8208 onError:
8209 Py_DECREF(*exceptionObject);
8210 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 }
8212}
8213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008215static void
8216raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218 Py_ssize_t startpos, Py_ssize_t endpos,
8219 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220{
8221 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225}
8226
8227/* error handling callback helper:
8228 build arguments, call the callback and check the arguments,
8229 put the result into newpos and return the replacement string, which
8230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231static PyObject *
8232unicode_translate_call_errorhandler(const char *errors,
8233 PyObject **errorHandler,
8234 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236 Py_ssize_t startpos, Py_ssize_t endpos,
8237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008239 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008241 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 PyObject *restuple;
8243 PyObject *resunicode;
8244
8245 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 }
8250
8251 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255
8256 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008261 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 Py_DECREF(restuple);
8263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 &resunicode, &i_newpos)) {
8267 Py_DECREF(restuple);
8268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008270 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 else
8273 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8276 Py_DECREF(restuple);
8277 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 Py_INCREF(resunicode);
8280 Py_DECREF(restuple);
8281 return resunicode;
8282}
8283
8284/* Lookup the character ch in the mapping and put the result in result,
8285 which must be decrefed by the caller.
8286 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008287static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289{
Christian Heimes217cfd12007-12-02 14:31:20 +00008290 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 PyObject *x;
8292
8293 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 x = PyObject_GetItem(mapping, w);
8296 Py_DECREF(w);
8297 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8299 /* No mapping found means: use 1:1 mapping. */
8300 PyErr_Clear();
8301 *result = NULL;
8302 return 0;
8303 } else
8304 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 }
8306 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 *result = x;
8308 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 long value = PyLong_AS_LONG(x);
8312 long max = PyUnicode_GetMax();
8313 if (value < 0 || value > max) {
8314 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008315 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_DECREF(x);
8317 return -1;
8318 }
8319 *result = x;
8320 return 0;
8321 }
8322 else if (PyUnicode_Check(x)) {
8323 *result = x;
8324 return 0;
8325 }
8326 else {
8327 /* wrong return value */
8328 PyErr_SetString(PyExc_TypeError,
8329 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330 Py_DECREF(x);
8331 return -1;
8332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333}
8334/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 if not reallocate and adjust various state variables.
8336 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008342 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008343 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 /* exponentially overallocate to minimize reallocations */
8345 if (requiredsize < 2 * oldsize)
8346 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008347 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8348 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008350 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 }
8353 return 0;
8354}
8355/* lookup the character, put the result in the output string and adjust
8356 various state variables. Return a new reference to the object that
8357 was put in the output buffer in *result, or Py_None, if the mapping was
8358 undefined (in which case no character was written).
8359 The called must decref result.
8360 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8363 PyObject *mapping, Py_UCS4 **output,
8364 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8368 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 }
8374 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008376 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
8380 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 Py_ssize_t repsize;
8382 if (PyUnicode_READY(*res) == -1)
8383 return -1;
8384 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 if (repsize==1) {
8386 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 }
8389 else if (repsize!=0) {
8390 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 Py_ssize_t requiredsize = *opos +
8392 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 Py_ssize_t i;
8395 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 for(i = 0; i < repsize; i++)
8398 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 }
8401 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 return 0;
8404}
8405
Alexander Belopolsky40018472011-02-26 01:02:56 +00008406PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407_PyUnicode_TranslateCharmap(PyObject *input,
8408 PyObject *mapping,
8409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 /* input object */
8412 char *idata;
8413 Py_ssize_t size, i;
8414 int kind;
8415 /* output buffer */
8416 Py_UCS4 *output = NULL;
8417 Py_ssize_t osize;
8418 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 char *reason = "character maps to <undefined>";
8422 PyObject *errorHandler = NULL;
8423 PyObject *exc = NULL;
8424 /* the following variable is used for caching string comparisons
8425 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8426 * 3=ignore, 4=xmlcharrefreplace */
8427 int known_errorHandler = -1;
8428
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 PyErr_BadArgument();
8431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 if (PyUnicode_READY(input) == -1)
8435 return NULL;
8436 idata = (char*)PyUnicode_DATA(input);
8437 kind = PyUnicode_KIND(input);
8438 size = PyUnicode_GET_LENGTH(input);
8439 i = 0;
8440
8441 if (size == 0) {
8442 Py_INCREF(input);
8443 return input;
8444 }
8445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 /* allocate enough for a simple 1:1 translation without
8447 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 osize = size;
8449 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8450 opos = 0;
8451 if (output == NULL) {
8452 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 /* try to encode it */
8458 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 if (charmaptranslate_output(input, i, mapping,
8460 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 Py_XDECREF(x);
8462 goto onError;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 else { /* untranslatable character */
8468 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8469 Py_ssize_t repsize;
8470 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t collstart = i;
8474 Py_ssize_t collend = i+1;
8475 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 while (collend < size) {
8479 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 goto onError;
8481 Py_XDECREF(x);
8482 if (x!=Py_None)
8483 break;
8484 ++collend;
8485 }
8486 /* cache callback name lookup
8487 * (if not done yet, i.e. it's the first error) */
8488 if (known_errorHandler==-1) {
8489 if ((errors==NULL) || (!strcmp(errors, "strict")))
8490 known_errorHandler = 1;
8491 else if (!strcmp(errors, "replace"))
8492 known_errorHandler = 2;
8493 else if (!strcmp(errors, "ignore"))
8494 known_errorHandler = 3;
8495 else if (!strcmp(errors, "xmlcharrefreplace"))
8496 known_errorHandler = 4;
8497 else
8498 known_errorHandler = 0;
8499 }
8500 switch (known_errorHandler) {
8501 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 raise_translate_exception(&exc, input, collstart,
8503 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 case 2: /* replace */
8506 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 for (coll = collstart; coll<collend; coll++)
8508 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 /* fall through */
8510 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 break;
8513 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 /* generate replacement (temporarily (mis)uses i) */
8515 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 char buffer[2+29+1+1];
8517 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8519 if (charmaptranslate_makespace(&output, &osize,
8520 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 goto onError;
8522 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 break;
8527 default:
8528 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 reason, input, &exc,
8530 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008531 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008533 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008534 Py_DECREF(repunicode);
8535 goto onError;
8536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 repsize = PyUnicode_GET_LENGTH(repunicode);
8539 if (charmaptranslate_makespace(&output, &osize,
8540 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 Py_DECREF(repunicode);
8542 goto onError;
8543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 for (uni2 = 0; repsize-->0; ++uni2)
8545 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8546 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008548 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008549 }
8550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8552 if (!res)
8553 goto onError;
8554 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 Py_XDECREF(exc);
8556 Py_XDECREF(errorHandler);
8557 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 Py_XDECREF(exc);
8562 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return NULL;
8564}
8565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566/* Deprecated. Use PyUnicode_Translate instead. */
8567PyObject *
8568PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8569 Py_ssize_t size,
8570 PyObject *mapping,
8571 const char *errors)
8572{
Christian Heimes5f520f42012-09-11 14:03:25 +02008573 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8575 if (!unicode)
8576 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008577 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8578 Py_DECREF(unicode);
8579 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582PyObject *
8583PyUnicode_Translate(PyObject *str,
8584 PyObject *mapping,
8585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586{
8587 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008588
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 str = PyUnicode_FromObject(str);
8590 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008591 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 Py_DECREF(str);
8594 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595}
Tim Petersced69f82003-09-16 20:30:58 +00008596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008598fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599{
8600 /* No need to call PyUnicode_READY(self) because this function is only
8601 called as a callback from fixup() which does it already. */
8602 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8603 const int kind = PyUnicode_KIND(self);
8604 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008605 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008606 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t i;
8608
8609 for (i = 0; i < len; ++i) {
8610 ch = PyUnicode_READ(kind, data, i);
8611 fixed = 0;
8612 if (ch > 127) {
8613 if (Py_UNICODE_ISSPACE(ch))
8614 fixed = ' ';
8615 else {
8616 const int decimal = Py_UNICODE_TODECIMAL(ch);
8617 if (decimal >= 0)
8618 fixed = '0' + decimal;
8619 }
8620 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008622 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 PyUnicode_WRITE(kind, data, i, fixed);
8624 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008625 else
8626 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 }
8629
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008630 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631}
8632
8633PyObject *
8634_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8635{
8636 if (!PyUnicode_Check(unicode)) {
8637 PyErr_BadInternalCall();
8638 return NULL;
8639 }
8640 if (PyUnicode_READY(unicode) == -1)
8641 return NULL;
8642 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8643 /* If the string is already ASCII, just return the same string */
8644 Py_INCREF(unicode);
8645 return unicode;
8646 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008647 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648}
8649
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008650PyObject *
8651PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8652 Py_ssize_t length)
8653{
Victor Stinnerf0124502011-11-21 23:12:56 +01008654 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008655 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008656 Py_UCS4 maxchar;
8657 enum PyUnicode_Kind kind;
8658 void *data;
8659
Victor Stinner99d7ad02012-02-22 13:37:39 +01008660 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008661 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008662 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008663 if (ch > 127) {
8664 int decimal = Py_UNICODE_TODECIMAL(ch);
8665 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008666 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008667 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008668 }
8669 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008670
8671 /* Copy to a new string */
8672 decimal = PyUnicode_New(length, maxchar);
8673 if (decimal == NULL)
8674 return decimal;
8675 kind = PyUnicode_KIND(decimal);
8676 data = PyUnicode_DATA(decimal);
8677 /* Iterate over code points */
8678 for (i = 0; i < length; i++) {
8679 Py_UNICODE ch = s[i];
8680 if (ch > 127) {
8681 int decimal = Py_UNICODE_TODECIMAL(ch);
8682 if (decimal >= 0)
8683 ch = '0' + decimal;
8684 }
8685 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008687 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008688}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008689/* --- Decimal Encoder ---------------------------------------------------- */
8690
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691int
8692PyUnicode_EncodeDecimal(Py_UNICODE *s,
8693 Py_ssize_t length,
8694 char *output,
8695 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008697 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008698 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008699 enum PyUnicode_Kind kind;
8700 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008701
8702 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 PyErr_BadArgument();
8704 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008705 }
8706
Victor Stinner42bf7752011-11-21 22:52:58 +01008707 unicode = PyUnicode_FromUnicode(s, length);
8708 if (unicode == NULL)
8709 return -1;
8710
Benjamin Petersonbac79492012-01-14 13:34:47 -05008711 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008712 Py_DECREF(unicode);
8713 return -1;
8714 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008715 kind = PyUnicode_KIND(unicode);
8716 data = PyUnicode_DATA(unicode);
8717
Victor Stinnerb84d7232011-11-22 01:50:07 +01008718 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008719 PyObject *exc;
8720 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008722 Py_ssize_t startpos;
8723
8724 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008725
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008727 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008728 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 decimal = Py_UNICODE_TODECIMAL(ch);
8732 if (decimal >= 0) {
8733 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008734 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 continue;
8736 }
8737 if (0 < ch && ch < 256) {
8738 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008739 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 continue;
8741 }
Victor Stinner6345be92011-11-25 20:09:01 +01008742
Victor Stinner42bf7752011-11-21 22:52:58 +01008743 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008744 exc = NULL;
8745 raise_encode_exception(&exc, "decimal", unicode,
8746 startpos, startpos+1,
8747 "invalid decimal Unicode string");
8748 Py_XDECREF(exc);
8749 Py_DECREF(unicode);
8750 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008751 }
8752 /* 0-terminate the output string */
8753 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008754 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008755 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008756}
8757
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758/* --- Helpers ------------------------------------------------------------ */
8759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008761any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 Py_ssize_t start,
8763 Py_ssize_t end)
8764{
8765 int kind1, kind2, kind;
8766 void *buf1, *buf2;
8767 Py_ssize_t len1, len2, result;
8768
8769 kind1 = PyUnicode_KIND(s1);
8770 kind2 = PyUnicode_KIND(s2);
8771 kind = kind1 > kind2 ? kind1 : kind2;
8772 buf1 = PyUnicode_DATA(s1);
8773 buf2 = PyUnicode_DATA(s2);
8774 if (kind1 != kind)
8775 buf1 = _PyUnicode_AsKind(s1, kind);
8776 if (!buf1)
8777 return -2;
8778 if (kind2 != kind)
8779 buf2 = _PyUnicode_AsKind(s2, kind);
8780 if (!buf2) {
8781 if (kind1 != kind) PyMem_Free(buf1);
8782 return -2;
8783 }
8784 len1 = PyUnicode_GET_LENGTH(s1);
8785 len2 = PyUnicode_GET_LENGTH(s2);
8786
Victor Stinner794d5672011-10-10 03:21:36 +02008787 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008788 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008789 case PyUnicode_1BYTE_KIND:
8790 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8791 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8792 else
8793 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8794 break;
8795 case PyUnicode_2BYTE_KIND:
8796 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8797 break;
8798 case PyUnicode_4BYTE_KIND:
8799 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8800 break;
8801 default:
8802 assert(0); result = -2;
8803 }
8804 }
8805 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008806 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008807 case PyUnicode_1BYTE_KIND:
8808 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8809 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8810 else
8811 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8812 break;
8813 case PyUnicode_2BYTE_KIND:
8814 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8815 break;
8816 case PyUnicode_4BYTE_KIND:
8817 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8818 break;
8819 default:
8820 assert(0); result = -2;
8821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 }
8823
8824 if (kind1 != kind)
8825 PyMem_Free(buf1);
8826 if (kind2 != kind)
8827 PyMem_Free(buf2);
8828
8829 return result;
8830}
8831
8832Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008833_PyUnicode_InsertThousandsGrouping(
8834 PyObject *unicode, Py_ssize_t index,
8835 Py_ssize_t n_buffer,
8836 void *digits, Py_ssize_t n_digits,
8837 Py_ssize_t min_width,
8838 const char *grouping, PyObject *thousands_sep,
8839 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840{
Victor Stinner41a863c2012-02-24 00:37:51 +01008841 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008842 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008843 Py_ssize_t thousands_sep_len;
8844 Py_ssize_t len;
8845
8846 if (unicode != NULL) {
8847 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008848 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008849 }
8850 else {
8851 kind = PyUnicode_1BYTE_KIND;
8852 data = NULL;
8853 }
8854 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8855 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8856 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8857 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008858 if (thousands_sep_kind < kind) {
8859 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8860 if (!thousands_sep_data)
8861 return -1;
8862 }
8863 else {
8864 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8865 if (!data)
8866 return -1;
8867 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008868 }
8869
Benjamin Petersonead6b532011-12-20 17:23:42 -06008870 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008872 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008874 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008877 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008878 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008879 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008880 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008881 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008882 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008884 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008885 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008886 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008887 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008891 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008892 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008893 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008894 break;
8895 default:
8896 assert(0);
8897 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008899 if (unicode != NULL && thousands_sep_kind != kind) {
8900 if (thousands_sep_kind < kind)
8901 PyMem_Free(thousands_sep_data);
8902 else
8903 PyMem_Free(data);
8904 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008905 if (unicode == NULL) {
8906 *maxchar = 127;
8907 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008908 *maxchar = MAX_MAXCHAR(*maxchar,
8909 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008910 }
8911 }
8912 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913}
8914
8915
/* helper macro to fixup start/end slice values.

   `start` and `end` must be modifiable lvalues; `len` may be any
   expression (it is evaluated more than once, so it must be free of side
   effects).  An `end` beyond `len` is clipped to `len`; negative `start`
   and `end` are taken relative to `len` and clipped to 0 if still
   negative.  Every argument is parenthesized so that expressions such as
   `cond ? a : b` are not mis-parsed inside the comparisons. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008930
Alexander Belopolsky40018472011-02-26 01:02:56 +00008931Py_ssize_t
8932PyUnicode_Count(PyObject *str,
8933 PyObject *substr,
8934 Py_ssize_t start,
8935 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008938 PyObject* str_obj;
8939 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 int kind1, kind2, kind;
8941 void *buf1 = NULL, *buf2 = NULL;
8942 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008943
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008944 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008945 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008947 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008948 if (!sub_obj) {
8949 Py_DECREF(str_obj);
8950 return -1;
8951 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008952 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008953 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 Py_DECREF(str_obj);
8955 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 }
Tim Petersced69f82003-09-16 20:30:58 +00008957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 kind1 = PyUnicode_KIND(str_obj);
8959 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008960 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008963 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008964 if (kind2 > kind) {
8965 Py_DECREF(sub_obj);
8966 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008967 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008968 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008969 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 if (!buf2)
8972 goto onError;
8973 len1 = PyUnicode_GET_LENGTH(str_obj);
8974 len2 = PyUnicode_GET_LENGTH(sub_obj);
8975
8976 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008977 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008979 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8980 result = asciilib_count(
8981 ((Py_UCS1*)buf1) + start, end - start,
8982 buf2, len2, PY_SSIZE_T_MAX
8983 );
8984 else
8985 result = ucs1lib_count(
8986 ((Py_UCS1*)buf1) + start, end - start,
8987 buf2, len2, PY_SSIZE_T_MAX
8988 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 break;
8990 case PyUnicode_2BYTE_KIND:
8991 result = ucs2lib_count(
8992 ((Py_UCS2*)buf1) + start, end - start,
8993 buf2, len2, PY_SSIZE_T_MAX
8994 );
8995 break;
8996 case PyUnicode_4BYTE_KIND:
8997 result = ucs4lib_count(
8998 ((Py_UCS4*)buf1) + start, end - start,
8999 buf2, len2, PY_SSIZE_T_MAX
9000 );
9001 break;
9002 default:
9003 assert(0); result = 0;
9004 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005
9006 Py_DECREF(sub_obj);
9007 Py_DECREF(str_obj);
9008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (kind2 != kind)
9010 PyMem_Free(buf2);
9011
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 onError:
9014 Py_DECREF(sub_obj);
9015 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 if (kind2 != kind && buf2)
9017 PyMem_Free(buf2);
9018 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
9020
Alexander Belopolsky40018472011-02-26 01:02:56 +00009021Py_ssize_t
9022PyUnicode_Find(PyObject *str,
9023 PyObject *sub,
9024 Py_ssize_t start,
9025 Py_ssize_t end,
9026 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009028 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009031 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009033 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009034 if (!sub) {
9035 Py_DECREF(str);
9036 return -2;
9037 }
9038 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9039 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 Py_DECREF(str);
9041 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Tim Petersced69f82003-09-16 20:30:58 +00009043
Victor Stinner794d5672011-10-10 03:21:36 +02009044 result = any_find_slice(direction,
9045 str, sub, start, end
9046 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 Py_DECREF(sub);
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 return result;
9052}
9053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054Py_ssize_t
9055PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9056 Py_ssize_t start, Py_ssize_t end,
9057 int direction)
9058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009060 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (PyUnicode_READY(str) == -1)
9062 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009063 if (start < 0 || end < 0) {
9064 PyErr_SetString(PyExc_IndexError, "string index out of range");
9065 return -2;
9066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (end > PyUnicode_GET_LENGTH(str))
9068 end = PyUnicode_GET_LENGTH(str);
9069 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009070 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9071 kind, end-start, ch, direction);
9072 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009074 else
9075 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076}
9077
Alexander Belopolsky40018472011-02-26 01:02:56 +00009078static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009079tailmatch(PyObject *self,
9080 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081 Py_ssize_t start,
9082 Py_ssize_t end,
9083 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 int kind_self;
9086 int kind_sub;
9087 void *data_self;
9088 void *data_sub;
9089 Py_ssize_t offset;
9090 Py_ssize_t i;
9091 Py_ssize_t end_sub;
9092
9093 if (PyUnicode_READY(self) == -1 ||
9094 PyUnicode_READY(substring) == -1)
9095 return 0;
9096
9097 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 return 1;
9099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9101 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 kind_self = PyUnicode_KIND(self);
9106 data_self = PyUnicode_DATA(self);
9107 kind_sub = PyUnicode_KIND(substring);
9108 data_sub = PyUnicode_DATA(substring);
9109 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9110
9111 if (direction > 0)
9112 offset = end;
9113 else
9114 offset = start;
9115
9116 if (PyUnicode_READ(kind_self, data_self, offset) ==
9117 PyUnicode_READ(kind_sub, data_sub, 0) &&
9118 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9119 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9120 /* If both are of the same kind, memcmp is sufficient */
9121 if (kind_self == kind_sub) {
9122 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009123 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 data_sub,
9125 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009126 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 }
9128 /* otherwise we have to compare each character by first accesing it */
9129 else {
9130 /* We do not need to compare 0 and len(substring)-1 because
9131 the if statement above ensured already that they are equal
9132 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009133 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 for (i = 1; i < end_sub; ++i) {
9135 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9136 PyUnicode_READ(kind_sub, data_sub, i))
9137 return 0;
9138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 }
9142
9143 return 0;
9144}
9145
Alexander Belopolsky40018472011-02-26 01:02:56 +00009146Py_ssize_t
9147PyUnicode_Tailmatch(PyObject *str,
9148 PyObject *substr,
9149 Py_ssize_t start,
9150 Py_ssize_t end,
9151 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009153 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009154
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 str = PyUnicode_FromObject(str);
9156 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 substr = PyUnicode_FromObject(substr);
9159 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 Py_DECREF(str);
9161 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 }
Tim Petersced69f82003-09-16 20:30:58 +00009163
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009164 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 Py_DECREF(str);
9167 Py_DECREF(substr);
9168 return result;
9169}
9170
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171/* Apply fixfct filter to the Unicode object self and return a
9172 reference to the modified object */
9173
Alexander Belopolsky40018472011-02-26 01:02:56 +00009174static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009175fixup(PyObject *self,
9176 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 PyObject *u;
9179 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009180 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009182 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009185 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 /* fix functions return the new maximum character in a string,
9188 if the kind of the resulting unicode object does not change,
9189 everything is fine. Otherwise we need to change the string kind
9190 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009191 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009192
9193 if (maxchar_new == 0) {
9194 /* no changes */;
9195 if (PyUnicode_CheckExact(self)) {
9196 Py_DECREF(u);
9197 Py_INCREF(self);
9198 return self;
9199 }
9200 else
9201 return u;
9202 }
9203
Victor Stinnere6abb482012-05-02 01:15:40 +02009204 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205
Victor Stinnereaab6042011-12-11 22:22:39 +01009206 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009208
9209 /* In case the maximum character changed, we need to
9210 convert the string to the new category. */
9211 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9212 if (v == NULL) {
9213 Py_DECREF(u);
9214 return NULL;
9215 }
9216 if (maxchar_new > maxchar_old) {
9217 /* If the maxchar increased so that the kind changed, not all
9218 characters are representable anymore and we need to fix the
9219 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009220 _PyUnicode_FastCopyCharacters(v, 0,
9221 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009222 maxchar_old = fixfct(v);
9223 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 }
9225 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009226 _PyUnicode_FastCopyCharacters(v, 0,
9227 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009229 Py_DECREF(u);
9230 assert(_PyUnicode_CheckConsistency(v, 1));
9231 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232}
9233
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009234static PyObject *
9235ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009237 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9238 char *resdata, *data = PyUnicode_DATA(self);
9239 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009240
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009241 res = PyUnicode_New(len, 127);
9242 if (res == NULL)
9243 return NULL;
9244 resdata = PyUnicode_DATA(res);
9245 if (lower)
9246 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009248 _Py_bytes_upper(resdata, data, len);
9249 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250}
9251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009253handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255 Py_ssize_t j;
9256 int final_sigma;
9257 Py_UCS4 c;
9258 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009259
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009260 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9261
9262 where ! is a negation and \p{xxx} is a character with property xxx.
9263 */
9264 for (j = i - 1; j >= 0; j--) {
9265 c = PyUnicode_READ(kind, data, j);
9266 if (!_PyUnicode_IsCaseIgnorable(c))
9267 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009269 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9270 if (final_sigma) {
9271 for (j = i + 1; j < length; j++) {
9272 c = PyUnicode_READ(kind, data, j);
9273 if (!_PyUnicode_IsCaseIgnorable(c))
9274 break;
9275 }
9276 final_sigma = j == length || !_PyUnicode_IsCased(c);
9277 }
9278 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279}
9280
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009281static int
9282lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9283 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009285 /* Obscure special case. */
9286 if (c == 0x3A3) {
9287 mapped[0] = handle_capital_sigma(kind, data, length, i);
9288 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291}
9292
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293static Py_ssize_t
9294do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 Py_ssize_t i, k = 0;
9297 int n_res, j;
9298 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009299
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009300 c = PyUnicode_READ(kind, data, 0);
9301 n_res = _PyUnicode_ToUpperFull(c, mapped);
9302 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009303 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 for (i = 1; i < length; i++) {
9307 c = PyUnicode_READ(kind, data, i);
9308 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9309 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009310 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009311 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009312 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009313 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009314 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
9316
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009317static Py_ssize_t
9318do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9319 Py_ssize_t i, k = 0;
9320
9321 for (i = 0; i < length; i++) {
9322 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9323 int n_res, j;
9324 if (Py_UNICODE_ISUPPER(c)) {
9325 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9326 }
9327 else if (Py_UNICODE_ISLOWER(c)) {
9328 n_res = _PyUnicode_ToUpperFull(c, mapped);
9329 }
9330 else {
9331 n_res = 1;
9332 mapped[0] = c;
9333 }
9334 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009335 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336 res[k++] = mapped[j];
9337 }
9338 }
9339 return k;
9340}
9341
9342static Py_ssize_t
9343do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9344 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009346 Py_ssize_t i, k = 0;
9347
9348 for (i = 0; i < length; i++) {
9349 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9350 int n_res, j;
9351 if (lower)
9352 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9353 else
9354 n_res = _PyUnicode_ToUpperFull(c, mapped);
9355 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009356 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009357 res[k++] = mapped[j];
9358 }
9359 }
9360 return k;
9361}
9362
9363static Py_ssize_t
9364do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9365{
9366 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9367}
9368
9369static Py_ssize_t
9370do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9371{
9372 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9373}
9374
Benjamin Petersone51757f2012-01-12 21:10:29 -05009375static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009376do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9377{
9378 Py_ssize_t i, k = 0;
9379
9380 for (i = 0; i < length; i++) {
9381 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9382 Py_UCS4 mapped[3];
9383 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9384 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009385 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009386 res[k++] = mapped[j];
9387 }
9388 }
9389 return k;
9390}
9391
9392static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009393do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9394{
9395 Py_ssize_t i, k = 0;
9396 int previous_is_cased;
9397
9398 previous_is_cased = 0;
9399 for (i = 0; i < length; i++) {
9400 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9401 Py_UCS4 mapped[3];
9402 int n_res, j;
9403
9404 if (previous_is_cased)
9405 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9406 else
9407 n_res = _PyUnicode_ToTitleFull(c, mapped);
9408
9409 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009410 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009411 res[k++] = mapped[j];
9412 }
9413
9414 previous_is_cased = _PyUnicode_IsCased(c);
9415 }
9416 return k;
9417}
9418
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009419static PyObject *
9420case_operation(PyObject *self,
9421 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9422{
9423 PyObject *res = NULL;
9424 Py_ssize_t length, newlength = 0;
9425 int kind, outkind;
9426 void *data, *outdata;
9427 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9428
Benjamin Petersoneea48462012-01-16 14:28:50 -05009429 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009430
9431 kind = PyUnicode_KIND(self);
9432 data = PyUnicode_DATA(self);
9433 length = PyUnicode_GET_LENGTH(self);
9434 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9435 if (tmp == NULL)
9436 return PyErr_NoMemory();
9437 newlength = perform(kind, data, length, tmp, &maxchar);
9438 res = PyUnicode_New(newlength, maxchar);
9439 if (res == NULL)
9440 goto leave;
9441 tmpend = tmp + newlength;
9442 outdata = PyUnicode_DATA(res);
9443 outkind = PyUnicode_KIND(res);
9444 switch (outkind) {
9445 case PyUnicode_1BYTE_KIND:
9446 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9447 break;
9448 case PyUnicode_2BYTE_KIND:
9449 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9450 break;
9451 case PyUnicode_4BYTE_KIND:
9452 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9453 break;
9454 default:
9455 assert(0);
9456 break;
9457 }
9458 leave:
9459 PyMem_FREE(tmp);
9460 return res;
9461}
9462
Tim Peters8ce9f162004-08-27 01:49:32 +00009463PyObject *
9464PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009467 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009469 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009470 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9471 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009472 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009474 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009476 int use_memcpy;
9477 unsigned char *res_data = NULL, *sep_data = NULL;
9478 PyObject *last_obj;
9479 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 fseq = PySequence_Fast(seq, "");
9482 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009483 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009484 }
9485
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009486 /* NOTE: the following code can't call back into Python code,
9487 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009488 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009489
Tim Peters05eba1f2004-08-27 21:32:02 +00009490 seqlen = PySequence_Fast_GET_SIZE(fseq);
9491 /* If empty sequence, return u"". */
9492 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009493 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009494 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009495 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009496
Tim Peters05eba1f2004-08-27 21:32:02 +00009497 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009498 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009499 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009500 if (seqlen == 1) {
9501 if (PyUnicode_CheckExact(items[0])) {
9502 res = items[0];
9503 Py_INCREF(res);
9504 Py_DECREF(fseq);
9505 return res;
9506 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009508 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009509 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009510 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009511 /* Set up sep and seplen */
9512 if (separator == NULL) {
9513 /* fall back to a blank space separator */
9514 sep = PyUnicode_FromOrdinal(' ');
9515 if (!sep)
9516 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009517 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009518 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009519 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009520 else {
9521 if (!PyUnicode_Check(separator)) {
9522 PyErr_Format(PyExc_TypeError,
9523 "separator: expected str instance,"
9524 " %.80s found",
9525 Py_TYPE(separator)->tp_name);
9526 goto onError;
9527 }
9528 if (PyUnicode_READY(separator))
9529 goto onError;
9530 sep = separator;
9531 seplen = PyUnicode_GET_LENGTH(separator);
9532 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9533 /* inc refcount to keep this code path symmetric with the
9534 above case of a blank separator */
9535 Py_INCREF(sep);
9536 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009537 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009538 }
9539
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009540 /* There are at least two things to join, or else we have a subclass
9541 * of str in the sequence.
9542 * Do a pre-pass to figure out the total amount of space we'll
9543 * need (sz), and see whether all argument are strings.
9544 */
9545 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009546#ifdef Py_DEBUG
9547 use_memcpy = 0;
9548#else
9549 use_memcpy = 1;
9550#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009551 for (i = 0; i < seqlen; i++) {
9552 const Py_ssize_t old_sz = sz;
9553 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 if (!PyUnicode_Check(item)) {
9555 PyErr_Format(PyExc_TypeError,
9556 "sequence item %zd: expected str instance,"
9557 " %.80s found",
9558 i, Py_TYPE(item)->tp_name);
9559 goto onError;
9560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 if (PyUnicode_READY(item) == -1)
9562 goto onError;
9563 sz += PyUnicode_GET_LENGTH(item);
9564 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009565 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009566 if (i != 0)
9567 sz += seplen;
9568 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9569 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009571 goto onError;
9572 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009573 if (use_memcpy && last_obj != NULL) {
9574 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9575 use_memcpy = 0;
9576 }
9577 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009578 }
Tim Petersced69f82003-09-16 20:30:58 +00009579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 if (res == NULL)
9582 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009583
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009584 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009585#ifdef Py_DEBUG
9586 use_memcpy = 0;
9587#else
9588 if (use_memcpy) {
9589 res_data = PyUnicode_1BYTE_DATA(res);
9590 kind = PyUnicode_KIND(res);
9591 if (seplen != 0)
9592 sep_data = PyUnicode_1BYTE_DATA(sep);
9593 }
9594#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009596 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009597 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009599 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009600 if (use_memcpy) {
9601 Py_MEMCPY(res_data,
9602 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009603 kind * seplen);
9604 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009605 }
9606 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009607 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009608 res_offset += seplen;
9609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009611 itemlen = PyUnicode_GET_LENGTH(item);
9612 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009613 if (use_memcpy) {
9614 Py_MEMCPY(res_data,
9615 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009616 kind * itemlen);
9617 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 }
9619 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009620 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 res_offset += itemlen;
9622 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009623 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 if (use_memcpy)
9626 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009627 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 else
9629 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009630
Tim Peters05eba1f2004-08-27 21:32:02 +00009631 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009633 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009637 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009639 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return NULL;
9641}
9642
/* Store `value` into `length` consecutive characters of the buffer
   `data`, beginning at character index `start`.  `kind` selects the
   storage width (1, 2 or 4 bytes per character) and must not be
   PyUnicode_WCHAR_KIND.

   This is a macro: arguments may be evaluated more than once, so do not
   pass expressions with side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9666
Victor Stinnerd3f08822012-05-29 12:57:52 +02009667void
9668_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9669 Py_UCS4 fill_char)
9670{
9671 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9672 const void *data = PyUnicode_DATA(unicode);
9673 assert(PyUnicode_IS_READY(unicode));
9674 assert(unicode_modifiable(unicode));
9675 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9676 assert(start >= 0);
9677 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9678 FILL(kind, data, fill_char, start, length);
9679}
9680
Victor Stinner3fe55312012-01-04 00:33:50 +01009681Py_ssize_t
9682PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9683 Py_UCS4 fill_char)
9684{
9685 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009686
9687 if (!PyUnicode_Check(unicode)) {
9688 PyErr_BadInternalCall();
9689 return -1;
9690 }
9691 if (PyUnicode_READY(unicode) == -1)
9692 return -1;
9693 if (unicode_check_modifiable(unicode))
9694 return -1;
9695
Victor Stinnerd3f08822012-05-29 12:57:52 +02009696 if (start < 0) {
9697 PyErr_SetString(PyExc_IndexError, "string index out of range");
9698 return -1;
9699 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009700 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9701 PyErr_SetString(PyExc_ValueError,
9702 "fill character is bigger than "
9703 "the string maximum character");
9704 return -1;
9705 }
9706
9707 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9708 length = Py_MIN(maxlen, length);
9709 if (length <= 0)
9710 return 0;
9711
Victor Stinnerd3f08822012-05-29 12:57:52 +02009712 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009713 return length;
9714}
9715
Victor Stinner9310abb2011-10-05 00:59:23 +02009716static PyObject *
9717pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009718 Py_ssize_t left,
9719 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 PyObject *u;
9723 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009724 int kind;
9725 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726
9727 if (left < 0)
9728 left = 0;
9729 if (right < 0)
9730 right = 0;
9731
Victor Stinnerc4b49542011-12-11 22:44:26 +01009732 if (left == 0 && right == 0)
9733 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9736 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009737 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9738 return NULL;
9739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009741 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009743 if (!u)
9744 return NULL;
9745
9746 kind = PyUnicode_KIND(u);
9747 data = PyUnicode_DATA(u);
9748 if (left)
9749 FILL(kind, data, fill, 0, left);
9750 if (right)
9751 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009752 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009753 assert(_PyUnicode_CheckConsistency(u, 1));
9754 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755}
9756
Alexander Belopolsky40018472011-02-26 01:02:56 +00009757PyObject *
9758PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
9762 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009763 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009765 if (PyUnicode_READY(string) == -1) {
9766 Py_DECREF(string);
9767 return NULL;
9768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
Benjamin Petersonead6b532011-12-20 17:23:42 -06009770 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 if (PyUnicode_IS_ASCII(string))
9773 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009775 PyUnicode_GET_LENGTH(string), keepends);
9776 else
9777 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009779 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 break;
9781 case PyUnicode_2BYTE_KIND:
9782 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009783 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 PyUnicode_GET_LENGTH(string), keepends);
9785 break;
9786 case PyUnicode_4BYTE_KIND:
9787 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 PyUnicode_GET_LENGTH(string), keepends);
9790 break;
9791 default:
9792 assert(0);
9793 list = 0;
9794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795 Py_DECREF(string);
9796 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009800split(PyObject *self,
9801 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 int kind1, kind2, kind;
9805 void *buf1, *buf2;
9806 Py_ssize_t len1, len2;
9807 PyObject* out;
9808
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009810 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (PyUnicode_READY(self) == -1)
9813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009816 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 if (PyUnicode_IS_ASCII(self))
9819 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 else
9824 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_2BYTE_KIND:
9829 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 case PyUnicode_4BYTE_KIND:
9834 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 PyUnicode_GET_LENGTH(self), maxcount
9837 );
9838 default:
9839 assert(0);
9840 return NULL;
9841 }
9842
9843 if (PyUnicode_READY(substring) == -1)
9844 return NULL;
9845
9846 kind1 = PyUnicode_KIND(self);
9847 kind2 = PyUnicode_KIND(substring);
9848 kind = kind1 > kind2 ? kind1 : kind2;
9849 buf1 = PyUnicode_DATA(self);
9850 buf2 = PyUnicode_DATA(substring);
9851 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009852 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (!buf1)
9854 return NULL;
9855 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009856 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (!buf2) {
9858 if (kind1 != kind) PyMem_Free(buf1);
9859 return NULL;
9860 }
9861 len1 = PyUnicode_GET_LENGTH(self);
9862 len2 = PyUnicode_GET_LENGTH(substring);
9863
Benjamin Petersonead6b532011-12-20 17:23:42 -06009864 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9867 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 else
9870 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 break;
9881 default:
9882 out = NULL;
9883 }
9884 if (kind1 != kind)
9885 PyMem_Free(buf1);
9886 if (kind2 != kind)
9887 PyMem_Free(buf2);
9888 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889}
9890
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009892rsplit(PyObject *self,
9893 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 int kind1, kind2, kind;
9897 void *buf1, *buf2;
9898 Py_ssize_t len1, len2;
9899 PyObject* out;
9900
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009902 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (PyUnicode_READY(self) == -1)
9905 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009908 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 if (PyUnicode_IS_ASCII(self))
9911 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 else
9916 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 case PyUnicode_2BYTE_KIND:
9921 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 case PyUnicode_4BYTE_KIND:
9926 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 default:
9931 assert(0);
9932 return NULL;
9933 }
9934
9935 if (PyUnicode_READY(substring) == -1)
9936 return NULL;
9937
9938 kind1 = PyUnicode_KIND(self);
9939 kind2 = PyUnicode_KIND(substring);
9940 kind = kind1 > kind2 ? kind1 : kind2;
9941 buf1 = PyUnicode_DATA(self);
9942 buf2 = PyUnicode_DATA(substring);
9943 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009944 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (!buf1)
9946 return NULL;
9947 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009948 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (!buf2) {
9950 if (kind1 != kind) PyMem_Free(buf1);
9951 return NULL;
9952 }
9953 len1 = PyUnicode_GET_LENGTH(self);
9954 len2 = PyUnicode_GET_LENGTH(substring);
9955
Benjamin Petersonead6b532011-12-20 17:23:42 -06009956 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009958 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9959 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 else
9962 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 case PyUnicode_2BYTE_KIND:
9966 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_4BYTE_KIND:
9970 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 default:
9974 out = NULL;
9975 }
9976 if (kind1 != kind)
9977 PyMem_Free(buf1);
9978 if (kind2 != kind)
9979 PyMem_Free(buf2);
9980 return out;
9981}
9982
9983static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9985 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009987 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9990 return asciilib_find(buf1, len1, buf2, len2, offset);
9991 else
9992 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 case PyUnicode_2BYTE_KIND:
9994 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9995 case PyUnicode_4BYTE_KIND:
9996 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9997 }
9998 assert(0);
9999 return -1;
10000}
10001
10002static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10004 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010006 switch (kind) {
10007 case PyUnicode_1BYTE_KIND:
10008 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10009 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10010 else
10011 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10012 case PyUnicode_2BYTE_KIND:
10013 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10014 case PyUnicode_4BYTE_KIND:
10015 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10016 }
10017 assert(0);
10018 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010019}
10020
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022replace(PyObject *self, PyObject *str1,
10023 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 PyObject *u;
10026 char *sbuf = PyUnicode_DATA(self);
10027 char *buf1 = PyUnicode_DATA(str1);
10028 char *buf2 = PyUnicode_DATA(str2);
10029 int srelease = 0, release1 = 0, release2 = 0;
10030 int skind = PyUnicode_KIND(self);
10031 int kind1 = PyUnicode_KIND(str1);
10032 int kind2 = PyUnicode_KIND(str2);
10033 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10034 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10035 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010036 int mayshrink;
10037 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
10039 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010040 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010042 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Victor Stinner59de0ee2011-10-07 10:01:28 +020010044 if (str1 == str2)
10045 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (skind < kind1)
10047 /* substring too wide to be present */
10048 goto nothing;
10049
Victor Stinner49a0a212011-10-12 23:46:10 +020010050 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10051 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10052 /* Replacing str1 with str2 may cause a maxchar reduction in the
10053 result string. */
10054 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010055 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010063 Py_UCS4 u1, u2;
10064 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010065 Py_ssize_t index, pos;
10066 char *src;
10067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010069 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10070 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010076 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010078
10079 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10080 index = 0;
10081 src = sbuf;
10082 while (--maxcount)
10083 {
10084 pos++;
10085 src += pos * PyUnicode_KIND(self);
10086 slen -= pos;
10087 index += pos;
10088 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10089 if (pos < 0)
10090 break;
10091 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10092 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010093 }
10094 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 int rkind = skind;
10096 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010097 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (kind1 < rkind) {
10100 /* widen substring */
10101 buf1 = _PyUnicode_AsKind(str1, rkind);
10102 if (!buf1) goto error;
10103 release1 = 1;
10104 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010106 if (i < 0)
10107 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 if (rkind > kind2) {
10109 /* widen replacement */
10110 buf2 = _PyUnicode_AsKind(str2, rkind);
10111 if (!buf2) goto error;
10112 release2 = 1;
10113 }
10114 else if (rkind < kind2) {
10115 /* widen self and buf1 */
10116 rkind = kind2;
10117 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010118 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 sbuf = _PyUnicode_AsKind(self, rkind);
10120 if (!sbuf) goto error;
10121 srelease = 1;
10122 buf1 = _PyUnicode_AsKind(str1, rkind);
10123 if (!buf1) goto error;
10124 release1 = 1;
10125 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 u = PyUnicode_New(slen, maxchar);
10127 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 assert(PyUnicode_KIND(u) == rkind);
10130 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010131
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138
10139 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010143 if (i == -1)
10144 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 }
10152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010154 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 int rkind = skind;
10156 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 buf1 = _PyUnicode_AsKind(str1, rkind);
10161 if (!buf1) goto error;
10162 release1 = 1;
10163 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 if (n == 0)
10166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010168 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 buf2 = _PyUnicode_AsKind(str2, rkind);
10170 if (!buf2) goto error;
10171 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 rkind = kind2;
10176 sbuf = _PyUnicode_AsKind(self, rkind);
10177 if (!sbuf) goto error;
10178 srelease = 1;
10179 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010180 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 buf1 = _PyUnicode_AsKind(str1, rkind);
10182 if (!buf1) goto error;
10183 release1 = 1;
10184 }
10185 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10186 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010187 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 PyErr_SetString(PyExc_OverflowError,
10189 "replace string is too long");
10190 goto error;
10191 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010192 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010194 _Py_INCREF_UNICODE_EMPTY();
10195 if (!unicode_empty)
10196 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 u = unicode_empty;
10198 goto done;
10199 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010200 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 PyErr_SetString(PyExc_OverflowError,
10202 "replace string is too long");
10203 goto error;
10204 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 u = PyUnicode_New(new_size, maxchar);
10206 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 assert(PyUnicode_KIND(u) == rkind);
10209 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 ires = i = 0;
10211 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 while (n-- > 0) {
10213 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010217 if (j == -1)
10218 break;
10219 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 memcpy(res + rkind * ires,
10222 sbuf + rkind * i,
10223 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010225 }
10226 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 memcpy(res + rkind * ires,
10238 sbuf + rkind * i,
10239 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 }
10241 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 /* interleave */
10243 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 if (--n <= 0)
10249 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 memcpy(res + rkind * ires,
10251 sbuf + rkind * i,
10252 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 ires++;
10254 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 memcpy(res + rkind * ires,
10257 sbuf + rkind * i,
10258 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 }
10261
10262 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010263 unicode_adjust_maxchar(&u);
10264 if (u == NULL)
10265 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010267
10268 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (srelease)
10270 PyMem_FREE(sbuf);
10271 if (release1)
10272 PyMem_FREE(buf1);
10273 if (release2)
10274 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010275 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (srelease)
10281 PyMem_FREE(sbuf);
10282 if (release1)
10283 PyMem_FREE(buf1);
10284 if (release2)
10285 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010286 return unicode_result_unchanged(self);
10287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 error:
10289 if (srelease && sbuf)
10290 PyMem_FREE(sbuf);
10291 if (release1 && buf1)
10292 PyMem_FREE(buf1);
10293 if (release2 && buf2)
10294 PyMem_FREE(buf2);
10295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296}
10297
10298/* --- Unicode Object Methods --------------------------------------------- */
10299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010300PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302\n\
10303Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305
10306static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010307unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010309 if (PyUnicode_READY(self) == -1)
10310 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010311 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312}
10313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010314PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316\n\
10317Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010318have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319
10320static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010321unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
10325 if (PyUnicode_GET_LENGTH(self) == 0)
10326 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010327 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328}
10329
Benjamin Petersond5890c82012-01-14 13:23:30 -050010330PyDoc_STRVAR(casefold__doc__,
10331 "S.casefold() -> str\n\
10332\n\
10333Return a version of S suitable for caseless comparisons.");
10334
10335static PyObject *
10336unicode_casefold(PyObject *self)
10337{
10338 if (PyUnicode_READY(self) == -1)
10339 return NULL;
10340 if (PyUnicode_IS_ASCII(self))
10341 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010342 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010343}
10344
10345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010346/* Argument converter. Coerces to a single unicode character */
10347
10348static int
10349convert_uc(PyObject *obj, void *addr)
10350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010353
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 uniobj = PyUnicode_FromObject(obj);
10355 if (uniobj == NULL) {
10356 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010357 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 return 0;
10359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 Py_DECREF(uniobj);
10364 return 0;
10365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367 Py_DECREF(uniobj);
10368 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010369}
10370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010371PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010374Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010375done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
10377static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010378unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010380 Py_ssize_t marg, left;
10381 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 Py_UCS4 fillchar = ' ';
10383
Victor Stinnere9a29352011-10-01 02:14:59 +020010384 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Benjamin Petersonbac79492012-01-14 13:34:47 -050010387 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 return NULL;
10389
Victor Stinnerc4b49542011-12-11 22:44:26 +010010390 if (PyUnicode_GET_LENGTH(self) >= width)
10391 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Victor Stinnerc4b49542011-12-11 22:44:26 +010010393 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 left = marg / 2 + (marg & width & 1);
10395
Victor Stinner9310abb2011-10-05 00:59:23 +020010396 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399/* This function assumes that str1 and str2 are readied by the caller. */
10400
Marc-André Lemburge5034372000-08-08 08:04:29 +000010401static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010402unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 int kind1, kind2;
10405 void *data1, *data2;
10406 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 kind1 = PyUnicode_KIND(str1);
10409 kind2 = PyUnicode_KIND(str2);
10410 data1 = PyUnicode_DATA(str1);
10411 data2 = PyUnicode_DATA(str2);
10412 len1 = PyUnicode_GET_LENGTH(str1);
10413 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 for (i = 0; i < len1 && i < len2; ++i) {
10416 Py_UCS4 c1, c2;
10417 c1 = PyUnicode_READ(kind1, data1, i);
10418 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010419
10420 if (c1 != c2)
10421 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010422 }
10423
10424 return (len1 < len2) ? -1 : (len1 != len2);
10425}
10426
Alexander Belopolsky40018472011-02-26 01:02:56 +000010427int
10428PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10431 if (PyUnicode_READY(left) == -1 ||
10432 PyUnicode_READY(right) == -1)
10433 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010434 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010436 PyErr_Format(PyExc_TypeError,
10437 "Can't compare %.100s and %.100s",
10438 left->ob_type->tp_name,
10439 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440 return -1;
10441}
10442
Martin v. Löwis5b222132007-06-10 09:51:05 +000010443int
10444PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 Py_ssize_t i;
10447 int kind;
10448 void *data;
10449 Py_UCS4 chr;
10450
Victor Stinner910337b2011-10-03 03:20:16 +020010451 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (PyUnicode_READY(uni) == -1)
10453 return -1;
10454 kind = PyUnicode_KIND(uni);
10455 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010456 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10458 if (chr != str[i])
10459 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010460 /* This check keeps Python strings that end in '\0' from comparing equal
10461 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010464 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010466 return 0;
10467}
10468
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010469
Benjamin Peterson29060642009-01-31 22:14:21 +000010470#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010472
Alexander Belopolsky40018472011-02-26 01:02:56 +000010473PyObject *
10474PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010475{
10476 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010477
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010478 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10479 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 if (PyUnicode_READY(left) == -1 ||
10481 PyUnicode_READY(right) == -1)
10482 return NULL;
10483 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10484 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010485 if (op == Py_EQ) {
10486 Py_INCREF(Py_False);
10487 return Py_False;
10488 }
10489 if (op == Py_NE) {
10490 Py_INCREF(Py_True);
10491 return Py_True;
10492 }
10493 }
10494 if (left == right)
10495 result = 0;
10496 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010497 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010499 /* Convert the return value to a Boolean */
10500 switch (op) {
10501 case Py_EQ:
10502 v = TEST_COND(result == 0);
10503 break;
10504 case Py_NE:
10505 v = TEST_COND(result != 0);
10506 break;
10507 case Py_LE:
10508 v = TEST_COND(result <= 0);
10509 break;
10510 case Py_GE:
10511 v = TEST_COND(result >= 0);
10512 break;
10513 case Py_LT:
10514 v = TEST_COND(result == -1);
10515 break;
10516 case Py_GT:
10517 v = TEST_COND(result == 1);
10518 break;
10519 default:
10520 PyErr_BadArgument();
10521 return NULL;
10522 }
10523 Py_INCREF(v);
10524 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010526
Brian Curtindfc80e32011-08-10 20:28:54 -050010527 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010528}
10529
Alexander Belopolsky40018472011-02-26 01:02:56 +000010530int
10531PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010532{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 int kind1, kind2, kind;
10535 void *buf1, *buf2;
10536 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010537 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010538
10539 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010540 sub = PyUnicode_FromObject(element);
10541 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010542 PyErr_Format(PyExc_TypeError,
10543 "'in <string>' requires string as left operand, not %s",
10544 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010545 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010546 }
10547
Thomas Wouters477c8d52006-05-27 19:21:47 +000010548 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010549 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550 Py_DECREF(sub);
10551 return -1;
10552 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010553 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10554 Py_DECREF(sub);
10555 Py_DECREF(str);
10556 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 kind1 = PyUnicode_KIND(str);
10559 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010560 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 buf1 = PyUnicode_DATA(str);
10562 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010563 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010564 if (kind2 > kind) {
10565 Py_DECREF(sub);
10566 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010567 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010568 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (!buf2) {
10572 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010573 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 return -1;
10575 }
10576 len1 = PyUnicode_GET_LENGTH(str);
10577 len2 = PyUnicode_GET_LENGTH(sub);
10578
Benjamin Petersonead6b532011-12-20 17:23:42 -060010579 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 case PyUnicode_1BYTE_KIND:
10581 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10582 break;
10583 case PyUnicode_2BYTE_KIND:
10584 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10585 break;
10586 case PyUnicode_4BYTE_KIND:
10587 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10588 break;
10589 default:
10590 result = -1;
10591 assert(0);
10592 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593
10594 Py_DECREF(str);
10595 Py_DECREF(sub);
10596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (kind2 != kind)
10598 PyMem_Free(buf2);
10599
Guido van Rossum403d68b2000-03-13 15:55:09 +000010600 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010601}
10602
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603/* Concat to string or Unicode object giving a new Unicode object. */
10604
Alexander Belopolsky40018472011-02-26 01:02:56 +000010605PyObject *
10606PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010609 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010610 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619
10620 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010621 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010625 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 }
10629
Victor Stinner488fa492011-12-12 00:01:39 +010010630 u_len = PyUnicode_GET_LENGTH(u);
10631 v_len = PyUnicode_GET_LENGTH(v);
10632 if (u_len > PY_SSIZE_T_MAX - v_len) {
10633 PyErr_SetString(PyExc_OverflowError,
10634 "strings are too large to concat");
10635 goto onError;
10636 }
10637 new_len = u_len + v_len;
10638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010640 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010641 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010644 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010647 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10648 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 Py_DECREF(u);
10650 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010651 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 Py_XDECREF(u);
10656 Py_XDECREF(v);
10657 return NULL;
10658}
10659
Walter Dörwald1ab83302007-05-18 17:15:44 +000010660void
Victor Stinner23e56682011-10-03 03:54:37 +020010661PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010662{
Victor Stinner23e56682011-10-03 03:54:37 +020010663 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010664 Py_UCS4 maxchar, maxchar2;
10665 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010666
10667 if (p_left == NULL) {
10668 if (!PyErr_Occurred())
10669 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010670 return;
10671 }
Victor Stinner23e56682011-10-03 03:54:37 +020010672 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010673 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010674 if (!PyErr_Occurred())
10675 PyErr_BadInternalCall();
10676 goto error;
10677 }
10678
Benjamin Petersonbac79492012-01-14 13:34:47 -050010679 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010680 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010681 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010682 goto error;
10683
Victor Stinner488fa492011-12-12 00:01:39 +010010684 /* Shortcuts */
10685 if (left == unicode_empty) {
10686 Py_DECREF(left);
10687 Py_INCREF(right);
10688 *p_left = right;
10689 return;
10690 }
10691 if (right == unicode_empty)
10692 return;
10693
10694 left_len = PyUnicode_GET_LENGTH(left);
10695 right_len = PyUnicode_GET_LENGTH(right);
10696 if (left_len > PY_SSIZE_T_MAX - right_len) {
10697 PyErr_SetString(PyExc_OverflowError,
10698 "strings are too large to concat");
10699 goto error;
10700 }
10701 new_len = left_len + right_len;
10702
10703 if (unicode_modifiable(left)
10704 && PyUnicode_CheckExact(right)
10705 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010706 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10707 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010708 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010709 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010710 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10711 {
10712 /* append inplace */
10713 if (unicode_resize(p_left, new_len) != 0) {
10714 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10715 * deallocated so it cannot be put back into
10716 * 'variable'. The MemoryError is raised when there
10717 * is no value in 'variable', which might (very
10718 * remotely) be a cause of incompatibilities.
10719 */
10720 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010721 }
Victor Stinner488fa492011-12-12 00:01:39 +010010722 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010723 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010724 }
Victor Stinner488fa492011-12-12 00:01:39 +010010725 else {
10726 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10727 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010728 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010729
Victor Stinner488fa492011-12-12 00:01:39 +010010730 /* Concat the two Unicode strings */
10731 res = PyUnicode_New(new_len, maxchar);
10732 if (res == NULL)
10733 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010734 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10735 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010736 Py_DECREF(left);
10737 *p_left = res;
10738 }
10739 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010740 return;
10741
10742error:
Victor Stinner488fa492011-12-12 00:01:39 +010010743 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010744}
10745
10746void
10747PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 PyUnicode_Append(pleft, right);
10750 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010751}
10752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010753PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010756Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010757string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010758interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
10760static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010761unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010763 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010764 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010765 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 int kind1, kind2, kind;
10768 void *buf1, *buf2;
10769 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Jesus Ceaac451502011-04-20 17:09:23 +020010771 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10772 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 kind1 = PyUnicode_KIND(self);
10776 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010777 if (kind2 > kind1)
10778 return PyLong_FromLong(0);
10779 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 buf1 = PyUnicode_DATA(self);
10781 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010783 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (!buf2) {
10785 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 return NULL;
10787 }
10788 len1 = PyUnicode_GET_LENGTH(self);
10789 len2 = PyUnicode_GET_LENGTH(substring);
10790
10791 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010792 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 case PyUnicode_1BYTE_KIND:
10794 iresult = ucs1lib_count(
10795 ((Py_UCS1*)buf1) + start, end - start,
10796 buf2, len2, PY_SSIZE_T_MAX
10797 );
10798 break;
10799 case PyUnicode_2BYTE_KIND:
10800 iresult = ucs2lib_count(
10801 ((Py_UCS2*)buf1) + start, end - start,
10802 buf2, len2, PY_SSIZE_T_MAX
10803 );
10804 break;
10805 case PyUnicode_4BYTE_KIND:
10806 iresult = ucs4lib_count(
10807 ((Py_UCS4*)buf1) + start, end - start,
10808 buf2, len2, PY_SSIZE_T_MAX
10809 );
10810 break;
10811 default:
10812 assert(0); iresult = 0;
10813 }
10814
10815 result = PyLong_FromSsize_t(iresult);
10816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (kind2 != kind)
10818 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 return result;
10823}
10824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010825PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010826 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010828Encode S using the codec registered for encoding. Default encoding\n\
10829is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010830handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010831a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10832'xmlcharrefreplace' as well as any other name registered with\n\
10833codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010836unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010838 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 char *encoding = NULL;
10840 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010841
Benjamin Peterson308d6372009-09-18 21:42:35 +000010842 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10843 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010846}
10847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010848PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850\n\
10851Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010852If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
10854static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010855unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 Py_ssize_t i, j, line_pos, src_len, incr;
10858 Py_UCS4 ch;
10859 PyObject *u;
10860 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010863 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
10865 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867
Antoine Pitrou22425222011-10-04 19:10:51 +020010868 if (PyUnicode_READY(self) == -1)
10869 return NULL;
10870
Thomas Wouters7e474022000-07-16 12:04:32 +000010871 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 src_len = PyUnicode_GET_LENGTH(self);
10873 i = j = line_pos = 0;
10874 kind = PyUnicode_KIND(self);
10875 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 for (; i < src_len; i++) {
10878 ch = PyUnicode_READ(kind, src_data, i);
10879 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010880 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 goto overflow;
10885 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 goto overflow;
10892 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010894 if (ch == '\n' || ch == '\r')
10895 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010898 if (!found)
10899 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010900
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 if (!u)
10904 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 for (; i < src_len; i++) {
10910 ch = PyUnicode_READ(kind, src_data, i);
10911 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010913 incr = tabsize - (line_pos % tabsize);
10914 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010915 FILL(kind, dest_data, ' ', j, incr);
10916 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010918 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 line_pos++;
10921 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010922 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 if (ch == '\n' || ch == '\r')
10924 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 }
10927 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010928 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010929
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010931 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933}
10934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937\n\
10938Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010939such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940arguments start and end are interpreted as in slice notation.\n\
10941\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010942Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
10944static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010947 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010948 Py_ssize_t start;
10949 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010950 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
Jesus Ceaac451502011-04-20 17:09:23 +020010952 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10953 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (PyUnicode_READY(self) == -1)
10957 return NULL;
10958 if (PyUnicode_READY(substring) == -1)
10959 return NULL;
10960
Victor Stinner7931d9a2011-11-04 00:22:48 +010010961 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 if (result == -2)
10966 return NULL;
10967
Christian Heimes217cfd12007-12-02 14:31:20 +000010968 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969}
10970
10971static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010972unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010974 void *data;
10975 enum PyUnicode_Kind kind;
10976 Py_UCS4 ch;
10977 PyObject *res;
10978
10979 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10980 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010982 }
10983 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10984 PyErr_SetString(PyExc_IndexError, "string index out of range");
10985 return NULL;
10986 }
10987 kind = PyUnicode_KIND(self);
10988 data = PyUnicode_DATA(self);
10989 ch = PyUnicode_READ(kind, data, index);
10990 if (ch < 256)
10991 return get_latin1_char(ch);
10992
10993 res = PyUnicode_New(1, ch);
10994 if (res == NULL)
10995 return NULL;
10996 kind = PyUnicode_KIND(res);
10997 data = PyUnicode_DATA(res);
10998 PyUnicode_WRITE(kind, data, 0, ch);
10999 assert(_PyUnicode_CheckConsistency(res, 1));
11000 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001}
11002
Guido van Rossumc2504932007-09-18 19:42:40 +000011003/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011004 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011005static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007{
Guido van Rossumc2504932007-09-18 19:42:40 +000011008 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011009 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011010
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011011#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011012 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011013#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (_PyUnicode_HASH(self) != -1)
11015 return _PyUnicode_HASH(self);
11016 if (PyUnicode_READY(self) == -1)
11017 return -1;
11018 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011019 /*
11020 We make the hash of the empty string be 0, rather than using
11021 (prefix ^ suffix), since this slightly obfuscates the hash secret
11022 */
11023 if (len == 0) {
11024 _PyUnicode_HASH(self) = 0;
11025 return 0;
11026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027
11028 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011029#define HASH(P) \
11030 x ^= (Py_uhash_t) *P << 7; \
11031 while (--len >= 0) \
11032 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033
Georg Brandl2fb477c2012-02-21 00:33:36 +010011034 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 switch (PyUnicode_KIND(self)) {
11036 case PyUnicode_1BYTE_KIND: {
11037 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11038 HASH(c);
11039 break;
11040 }
11041 case PyUnicode_2BYTE_KIND: {
11042 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11043 HASH(s);
11044 break;
11045 }
11046 default: {
11047 Py_UCS4 *l;
11048 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11049 "Impossible switch case in unicode_hash");
11050 l = PyUnicode_4BYTE_DATA(self);
11051 HASH(l);
11052 break;
11053 }
11054 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011055 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11056 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057
Guido van Rossumc2504932007-09-18 19:42:40 +000011058 if (x == -1)
11059 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011061 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011065PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011068Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011073 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011074 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011075 Py_ssize_t start;
11076 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Jesus Ceaac451502011-04-20 17:09:23 +020011078 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11079 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (PyUnicode_READY(self) == -1)
11083 return NULL;
11084 if (PyUnicode_READY(substring) == -1)
11085 return NULL;
11086
Victor Stinner7931d9a2011-11-04 00:22:48 +010011087 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (result == -2)
11092 return NULL;
11093
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 if (result < 0) {
11095 PyErr_SetString(PyExc_ValueError, "substring not found");
11096 return NULL;
11097 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098
Christian Heimes217cfd12007-12-02 14:31:20 +000011099 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100}
11101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011102PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011103 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011105Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107
11108static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011109unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 Py_ssize_t i, length;
11112 int kind;
11113 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 int cased;
11115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 if (PyUnicode_READY(self) == -1)
11117 return NULL;
11118 length = PyUnicode_GET_LENGTH(self);
11119 kind = PyUnicode_KIND(self);
11120 data = PyUnicode_DATA(self);
11121
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (length == 1)
11124 return PyBool_FromLong(
11125 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011127 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011130
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 for (i = 0; i < length; i++) {
11133 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011134
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11136 return PyBool_FromLong(0);
11137 else if (!cased && Py_UNICODE_ISLOWER(ch))
11138 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011140 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141}
11142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011146Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011147at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
11149static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 Py_ssize_t i, length;
11153 int kind;
11154 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 int cased;
11156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (PyUnicode_READY(self) == -1)
11158 return NULL;
11159 length = PyUnicode_GET_LENGTH(self);
11160 kind = PyUnicode_KIND(self);
11161 data = PyUnicode_DATA(self);
11162
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (length == 1)
11165 return PyBool_FromLong(
11166 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011168 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 for (i = 0; i < length; i++) {
11174 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011175
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11177 return PyBool_FromLong(0);
11178 else if (!cased && Py_UNICODE_ISUPPER(ch))
11179 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182}
11183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011184PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011187Return True if S is a titlecased string and there is at least one\n\
11188character in S, i.e. upper- and titlecase characters may only\n\
11189follow uncased characters and lowercase characters only cased ones.\n\
11190Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011193unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 Py_ssize_t i, length;
11196 int kind;
11197 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 int cased, previous_is_cased;
11199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (PyUnicode_READY(self) == -1)
11201 return NULL;
11202 length = PyUnicode_GET_LENGTH(self);
11203 kind = PyUnicode_KIND(self);
11204 data = PyUnicode_DATA(self);
11205
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (length == 1) {
11208 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11209 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11210 (Py_UNICODE_ISUPPER(ch) != 0));
11211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011213 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011216
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 cased = 0;
11218 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 for (i = 0; i < length; i++) {
11220 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011221
Benjamin Peterson29060642009-01-31 22:14:21 +000011222 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11223 if (previous_is_cased)
11224 return PyBool_FromLong(0);
11225 previous_is_cased = 1;
11226 cased = 1;
11227 }
11228 else if (Py_UNICODE_ISLOWER(ch)) {
11229 if (!previous_is_cased)
11230 return PyBool_FromLong(0);
11231 previous_is_cased = 1;
11232 cased = 1;
11233 }
11234 else
11235 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011237 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238}
11239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011243Return True if all characters in S are whitespace\n\
11244and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
11246static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011247unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 Py_ssize_t i, length;
11250 int kind;
11251 void *data;
11252
11253 if (PyUnicode_READY(self) == -1)
11254 return NULL;
11255 length = PyUnicode_GET_LENGTH(self);
11256 kind = PyUnicode_KIND(self);
11257 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (length == 1)
11261 return PyBool_FromLong(
11262 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011264 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 for (i = 0; i < length; i++) {
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011270 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011273 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274}
11275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011279Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281
11282static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011283unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t i, length;
11286 int kind;
11287 void *data;
11288
11289 if (PyUnicode_READY(self) == -1)
11290 return NULL;
11291 length = PyUnicode_GET_LENGTH(self);
11292 kind = PyUnicode_KIND(self);
11293 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011294
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011295 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if (length == 1)
11297 return PyBool_FromLong(
11298 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299
11300 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 for (i = 0; i < length; i++) {
11305 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011307 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011308 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309}
11310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011311PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011314Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011315and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011316
11317static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011318unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 int kind;
11321 void *data;
11322 Py_ssize_t len, i;
11323
11324 if (PyUnicode_READY(self) == -1)
11325 return NULL;
11326
11327 kind = PyUnicode_KIND(self);
11328 data = PyUnicode_DATA(self);
11329 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011331 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (len == 1) {
11333 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11334 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11335 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336
11337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 for (i = 0; i < len; i++) {
11342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011343 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011352Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011387Return True if all characters in S are digits\n\
11388and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389
11390static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011391unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 Py_ssize_t i, length;
11394 int kind;
11395 void *data;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399 length = PyUnicode_GET_LENGTH(self);
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (length == 1) {
11405 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11406 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011409 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 for (i = 0; i < length; i++) {
11414 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011423Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 Py_ssize_t i, length;
11430 int kind;
11431 void *data;
11432
11433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435 length = PyUnicode_GET_LENGTH(self);
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (length == 1)
11441 return PyBool_FromLong(
11442 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011444 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 for (i = 0; i < length; i++) {
11449 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453}
11454
Martin v. Löwis47383402007-08-15 07:32:56 +000011455int
11456PyUnicode_IsIdentifier(PyObject *self)
11457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 int kind;
11459 void *data;
11460 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011461 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (PyUnicode_READY(self) == -1) {
11464 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 }
11467
11468 /* Special case for empty strings */
11469 if (PyUnicode_GET_LENGTH(self) == 0)
11470 return 0;
11471 kind = PyUnicode_KIND(self);
11472 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011473
11474 /* PEP 3131 says that the first character must be in
11475 XID_Start and subsequent characters in XID_Continue,
11476 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011477 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011478 letters, digits, underscore). However, given the current
11479 definition of XID_Start and XID_Continue, it is sufficient
11480 to check just for these, except that _ must be allowed
11481 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011483 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011484 return 0;
11485
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011486 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011489 return 1;
11490}
11491
11492PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011494\n\
11495Return True if S is a valid identifier according\n\
11496to the language definition.");
11497
11498static PyObject*
11499unicode_isidentifier(PyObject *self)
11500{
11501 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11502}
11503
Georg Brandl559e5d72008-06-11 18:37:52 +000011504PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011506\n\
11507Return True if all characters in S are considered\n\
11508printable in repr() or S is empty, False otherwise.");
11509
11510static PyObject*
11511unicode_isprintable(PyObject *self)
11512{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 Py_ssize_t i, length;
11514 int kind;
11515 void *data;
11516
11517 if (PyUnicode_READY(self) == -1)
11518 return NULL;
11519 length = PyUnicode_GET_LENGTH(self);
11520 kind = PyUnicode_KIND(self);
11521 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011522
11523 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011524 if (length == 1)
11525 return PyBool_FromLong(
11526 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 for (i = 0; i < length; i++) {
11529 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011530 Py_RETURN_FALSE;
11531 }
11532 }
11533 Py_RETURN_TRUE;
11534}
11535
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011536PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011537 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538\n\
11539Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011540iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
11542static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011543unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011545 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011546}
11547
Martin v. Löwis18e16552006-02-15 17:27:45 +000011548static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011549unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (PyUnicode_READY(self) == -1)
11552 return -1;
11553 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554}
11555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011556PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011559Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011560done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561
11562static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011563unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011565 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 Py_UCS4 fillchar = ' ';
11567
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011568 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569 return NULL;
11570
Benjamin Petersonbac79492012-01-14 13:34:47 -050011571 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011572 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Victor Stinnerc4b49542011-12-11 22:44:26 +010011574 if (PyUnicode_GET_LENGTH(self) >= width)
11575 return unicode_result_unchanged(self);
11576
11577 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578}
11579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011580PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584
11585static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011586unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011588 if (PyUnicode_READY(self) == -1)
11589 return NULL;
11590 if (PyUnicode_IS_ASCII(self))
11591 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011592 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593}
11594
/* Which end(s) of the string a strip operation trims */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip-type constants
   above.  Both the table slots and the strings they point at are
   read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the three-character "|O:" prefix
   of the corresponding format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11603
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011604/* externally visible for str.strip(unicode) */
11605PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011606_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608 void *data;
11609 int kind;
11610 Py_ssize_t i, j, len;
11611 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11614 return NULL;
11615
11616 kind = PyUnicode_KIND(self);
11617 data = PyUnicode_DATA(self);
11618 len = PyUnicode_GET_LENGTH(self);
11619 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11620 PyUnicode_DATA(sepobj),
11621 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011622
Benjamin Peterson14339b62009-01-31 16:36:08 +000011623 i = 0;
11624 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625 while (i < len &&
11626 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011627 i++;
11628 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011629 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011630
Benjamin Peterson14339b62009-01-31 16:36:08 +000011631 j = len;
11632 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 do {
11634 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 } while (j >= i &&
11636 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011638 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011639
Victor Stinner7931d9a2011-11-04 00:22:48 +010011640 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641}
11642
11643PyObject*
11644PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11645{
11646 unsigned char *data;
11647 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011648 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649
Victor Stinnerde636f32011-10-01 03:55:54 +020011650 if (PyUnicode_READY(self) == -1)
11651 return NULL;
11652
Victor Stinner684d5fd2012-05-03 02:32:34 +020011653 length = PyUnicode_GET_LENGTH(self);
11654 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011655
Victor Stinner684d5fd2012-05-03 02:32:34 +020011656 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011657 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658
Victor Stinnerde636f32011-10-01 03:55:54 +020011659 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011660 PyErr_SetString(PyExc_IndexError, "string index out of range");
11661 return NULL;
11662 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011663 if (start >= length || end < start)
11664 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011665
Victor Stinner684d5fd2012-05-03 02:32:34 +020011666 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011667 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011668 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011669 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011670 }
11671 else {
11672 kind = PyUnicode_KIND(self);
11673 data = PyUnicode_1BYTE_DATA(self);
11674 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011675 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011676 length);
11677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
11680static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011681do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 int kind;
11684 void *data;
11685 Py_ssize_t len, i, j;
11686
11687 if (PyUnicode_READY(self) == -1)
11688 return NULL;
11689
11690 kind = PyUnicode_KIND(self);
11691 data = PyUnicode_DATA(self);
11692 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011693
Benjamin Peterson14339b62009-01-31 16:36:08 +000011694 i = 0;
11695 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 i++;
11698 }
11699 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011700
Benjamin Peterson14339b62009-01-31 16:36:08 +000011701 j = len;
11702 if (striptype != LEFTSTRIP) {
11703 do {
11704 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011706 j++;
11707 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011708
Victor Stinner7931d9a2011-11-04 00:22:48 +010011709 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710}
11711
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011712
11713static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011714do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11719 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011720
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 if (sep != NULL && sep != Py_None) {
11722 if (PyUnicode_Check(sep))
11723 return _PyUnicode_XStrip(self, striptype, sep);
11724 else {
11725 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 "%s arg must be None or str",
11727 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 return NULL;
11729 }
11730 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011731
Benjamin Peterson14339b62009-01-31 16:36:08 +000011732 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011733}
11734
11735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011736PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011737 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011738\n\
11739Return a copy of the string S with leading and trailing\n\
11740whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011741If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011742
11743static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011744unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011746 if (PyTuple_GET_SIZE(args) == 0)
11747 return do_strip(self, BOTHSTRIP); /* Common case */
11748 else
11749 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011750}
11751
11752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011753PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011754 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011755\n\
11756Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011757If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758
11759static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011760unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011762 if (PyTuple_GET_SIZE(args) == 0)
11763 return do_strip(self, LEFTSTRIP); /* Common case */
11764 else
11765 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011766}
11767
11768
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011771\n\
11772Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011773If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774
11775static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011776unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011777{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011778 if (PyTuple_GET_SIZE(args) == 0)
11779 return do_strip(self, RIGHTSTRIP); /* Common case */
11780 else
11781 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011782}
11783
11784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011786unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011788 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790
Serhiy Storchaka05997252013-01-26 12:14:02 +020011791 if (len < 1)
11792 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Victor Stinnerc4b49542011-12-11 22:44:26 +010011794 /* no repeat, return original string */
11795 if (len == 1)
11796 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011797
Benjamin Petersonbac79492012-01-14 13:34:47 -050011798 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 return NULL;
11800
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011801 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011802 PyErr_SetString(PyExc_OverflowError,
11803 "repeated string is too long");
11804 return NULL;
11805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011807
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 if (!u)
11810 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011811 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011813 if (PyUnicode_GET_LENGTH(str) == 1) {
11814 const int kind = PyUnicode_KIND(str);
11815 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011816 if (kind == PyUnicode_1BYTE_KIND) {
11817 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011818 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011819 }
11820 else if (kind == PyUnicode_2BYTE_KIND) {
11821 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011822 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011823 ucs2[n] = fill_char;
11824 } else {
11825 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11826 assert(kind == PyUnicode_4BYTE_KIND);
11827 for (n = 0; n < len; ++n)
11828 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830 }
11831 else {
11832 /* number of characters copied this far */
11833 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011834 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011835 char *to = (char *) PyUnicode_DATA(u);
11836 Py_MEMCPY(to, PyUnicode_DATA(str),
11837 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 n = (done <= nchars-done) ? done : nchars-done;
11840 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011841 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011842 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 }
11844
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011845 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011846 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847}
11848
Alexander Belopolsky40018472011-02-26 01:02:56 +000011849PyObject *
11850PyUnicode_Replace(PyObject *obj,
11851 PyObject *subobj,
11852 PyObject *replobj,
11853 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854{
11855 PyObject *self;
11856 PyObject *str1;
11857 PyObject *str2;
11858 PyObject *result;
11859
11860 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011861 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011862 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011864 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 Py_DECREF(self);
11866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 }
11868 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011869 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 Py_DECREF(self);
11871 Py_DECREF(str1);
11872 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011873 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011874 if (PyUnicode_READY(self) == -1 ||
11875 PyUnicode_READY(str1) == -1 ||
11876 PyUnicode_READY(str2) == -1)
11877 result = NULL;
11878 else
11879 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880 Py_DECREF(self);
11881 Py_DECREF(str1);
11882 Py_DECREF(str2);
11883 return result;
11884}
11885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011886PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011887 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888\n\
11889Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011890old replaced by new. If the optional argument count is\n\
11891given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
11893static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 PyObject *str1;
11897 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011898 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899 PyObject *result;
11900
Martin v. Löwis18e16552006-02-15 17:27:45 +000011901 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011903 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011906 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 return NULL;
11908 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011909 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011910 Py_DECREF(str1);
11911 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011912 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011913 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11914 result = NULL;
11915 else
11916 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
11918 Py_DECREF(str1);
11919 Py_DECREF(str2);
11920 return result;
11921}
11922
Alexander Belopolsky40018472011-02-26 01:02:56 +000011923static PyObject *
11924unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011926 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 Py_ssize_t isize;
11928 Py_ssize_t osize, squote, dquote, i, o;
11929 Py_UCS4 max, quote;
11930 int ikind, okind;
11931 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011934 return NULL;
11935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 isize = PyUnicode_GET_LENGTH(unicode);
11937 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 /* Compute length of output, quote characters, and
11940 maximum character */
11941 osize = 2; /* quotes */
11942 max = 127;
11943 squote = dquote = 0;
11944 ikind = PyUnicode_KIND(unicode);
11945 for (i = 0; i < isize; i++) {
11946 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11947 switch (ch) {
11948 case '\'': squote++; osize++; break;
11949 case '"': dquote++; osize++; break;
11950 case '\\': case '\t': case '\r': case '\n':
11951 osize += 2; break;
11952 default:
11953 /* Fast-path ASCII */
11954 if (ch < ' ' || ch == 0x7f)
11955 osize += 4; /* \xHH */
11956 else if (ch < 0x7f)
11957 osize++;
11958 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11959 osize++;
11960 max = ch > max ? ch : max;
11961 }
11962 else if (ch < 0x100)
11963 osize += 4; /* \xHH */
11964 else if (ch < 0x10000)
11965 osize += 6; /* \uHHHH */
11966 else
11967 osize += 10; /* \uHHHHHHHH */
11968 }
11969 }
11970
11971 quote = '\'';
11972 if (squote) {
11973 if (dquote)
11974 /* Both squote and dquote present. Use squote,
11975 and escape them */
11976 osize += squote;
11977 else
11978 quote = '"';
11979 }
11980
11981 repr = PyUnicode_New(osize, max);
11982 if (repr == NULL)
11983 return NULL;
11984 okind = PyUnicode_KIND(repr);
11985 odata = PyUnicode_DATA(repr);
11986
11987 PyUnicode_WRITE(okind, odata, 0, quote);
11988 PyUnicode_WRITE(okind, odata, osize-1, quote);
11989
11990 for (i = 0, o = 1; i < isize; i++) {
11991 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011992
11993 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 if ((ch == quote) || (ch == '\\')) {
11995 PyUnicode_WRITE(okind, odata, o++, '\\');
11996 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011997 continue;
11998 }
11999
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012001 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002 PyUnicode_WRITE(okind, odata, o++, '\\');
12003 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012004 }
12005 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 PyUnicode_WRITE(okind, odata, o++, '\\');
12007 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012008 }
12009 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010 PyUnicode_WRITE(okind, odata, o++, '\\');
12011 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012012 }
12013
12014 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012015 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012016 PyUnicode_WRITE(okind, odata, o++, '\\');
12017 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012018 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12019 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012020 }
12021
Georg Brandl559e5d72008-06-11 18:37:52 +000012022 /* Copy ASCII characters as-is */
12023 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012025 }
12026
Benjamin Peterson29060642009-01-31 22:14:21 +000012027 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012028 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012030 (categories Z* and C* except ASCII space)
12031 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012033 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012034 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012037 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12038 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012039 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012040 /* Map 16-bit characters to '\uxxxx' */
12041 else if (ch <= 0xffff) {
12042 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012043 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12044 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12045 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12046 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012047 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012048 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012049 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012050 PyUnicode_WRITE(okind, odata, o++, 'U');
12051 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12052 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12053 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12058 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012059 }
12060 }
12061 /* Copy characters as-is */
12062 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012064 }
12065 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012068 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012069 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070}
12071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012072PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012073 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012074\n\
12075Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012076such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077arguments start and end are interpreted as in slice notation.\n\
12078\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012079Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
12081static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012084 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012085 Py_ssize_t start;
12086 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012087 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088
Jesus Ceaac451502011-04-20 17:09:23 +020012089 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12090 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012093 if (PyUnicode_READY(self) == -1)
12094 return NULL;
12095 if (PyUnicode_READY(substring) == -1)
12096 return NULL;
12097
Victor Stinner7931d9a2011-11-04 00:22:48 +010012098 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099
12100 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 if (result == -2)
12103 return NULL;
12104
Christian Heimes217cfd12007-12-02 14:31:20 +000012105 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106}
12107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012108PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
12113static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012116 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012117 Py_ssize_t start;
12118 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012119 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Jesus Ceaac451502011-04-20 17:09:23 +020012121 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12122 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012123 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 if (PyUnicode_READY(self) == -1)
12126 return NULL;
12127 if (PyUnicode_READY(substring) == -1)
12128 return NULL;
12129
Victor Stinner7931d9a2011-11-04 00:22:48 +010012130 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131
12132 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (result == -2)
12135 return NULL;
12136
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137 if (result < 0) {
12138 PyErr_SetString(PyExc_ValueError, "substring not found");
12139 return NULL;
12140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141
Christian Heimes217cfd12007-12-02 14:31:20 +000012142 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143}
12144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012145PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012146 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012148Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012149done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150
12151static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012152unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012154 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 Py_UCS4 fillchar = ' ';
12156
Victor Stinnere9a29352011-10-01 02:14:59 +020012157 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012159
Benjamin Petersonbac79492012-01-14 13:34:47 -050012160 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161 return NULL;
12162
Victor Stinnerc4b49542011-12-11 22:44:26 +010012163 if (PyUnicode_GET_LENGTH(self) >= width)
12164 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012165
Victor Stinnerc4b49542011-12-11 22:44:26 +010012166 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167}
12168
Alexander Belopolsky40018472011-02-26 01:02:56 +000012169PyObject *
12170PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012171{
12172 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012173
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174 s = PyUnicode_FromObject(s);
12175 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012176 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012177 if (sep != NULL) {
12178 sep = PyUnicode_FromObject(sep);
12179 if (sep == NULL) {
12180 Py_DECREF(s);
12181 return NULL;
12182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183 }
12184
Victor Stinner9310abb2011-10-05 00:59:23 +020012185 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
12187 Py_DECREF(s);
12188 Py_XDECREF(sep);
12189 return result;
12190}
12191
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012192PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012193 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194\n\
12195Return a list of the words in S, using sep as the\n\
12196delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012197splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012198whitespace string is a separator and empty strings are\n\
12199removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
12201static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012202unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012204 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012206 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012208 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12209 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210 return NULL;
12211
12212 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012215 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012217 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218}
12219
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220PyObject *
12221PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12222{
12223 PyObject* str_obj;
12224 PyObject* sep_obj;
12225 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012226 int kind1, kind2, kind;
12227 void *buf1 = NULL, *buf2 = NULL;
12228 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012229
12230 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012231 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012233 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012234 if (!sep_obj) {
12235 Py_DECREF(str_obj);
12236 return NULL;
12237 }
12238 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12239 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012240 Py_DECREF(str_obj);
12241 return NULL;
12242 }
12243
Victor Stinner14f8f022011-10-05 20:58:25 +020012244 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012246 kind = Py_MAX(kind1, kind2);
12247 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012249 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012250 if (!buf1)
12251 goto onError;
12252 buf2 = PyUnicode_DATA(sep_obj);
12253 if (kind2 != kind)
12254 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12255 if (!buf2)
12256 goto onError;
12257 len1 = PyUnicode_GET_LENGTH(str_obj);
12258 len2 = PyUnicode_GET_LENGTH(sep_obj);
12259
Benjamin Petersonead6b532011-12-20 17:23:42 -060012260 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012261 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012262 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12263 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12264 else
12265 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 break;
12267 case PyUnicode_2BYTE_KIND:
12268 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12269 break;
12270 case PyUnicode_4BYTE_KIND:
12271 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12272 break;
12273 default:
12274 assert(0);
12275 out = 0;
12276 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012277
12278 Py_DECREF(sep_obj);
12279 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280 if (kind1 != kind)
12281 PyMem_Free(buf1);
12282 if (kind2 != kind)
12283 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012284
12285 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 onError:
12287 Py_DECREF(sep_obj);
12288 Py_DECREF(str_obj);
12289 if (kind1 != kind && buf1)
12290 PyMem_Free(buf1);
12291 if (kind2 != kind && buf2)
12292 PyMem_Free(buf2);
12293 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012294}
12295
12296
12297PyObject *
12298PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12299{
12300 PyObject* str_obj;
12301 PyObject* sep_obj;
12302 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 int kind1, kind2, kind;
12304 void *buf1 = NULL, *buf2 = NULL;
12305 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012306
12307 str_obj = PyUnicode_FromObject(str_in);
12308 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012309 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310 sep_obj = PyUnicode_FromObject(sep_in);
12311 if (!sep_obj) {
12312 Py_DECREF(str_obj);
12313 return NULL;
12314 }
12315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 kind1 = PyUnicode_KIND(str_in);
12317 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012318 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 buf1 = PyUnicode_DATA(str_in);
12320 if (kind1 != kind)
12321 buf1 = _PyUnicode_AsKind(str_in, kind);
12322 if (!buf1)
12323 goto onError;
12324 buf2 = PyUnicode_DATA(sep_obj);
12325 if (kind2 != kind)
12326 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12327 if (!buf2)
12328 goto onError;
12329 len1 = PyUnicode_GET_LENGTH(str_obj);
12330 len2 = PyUnicode_GET_LENGTH(sep_obj);
12331
Benjamin Petersonead6b532011-12-20 17:23:42 -060012332 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012334 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12335 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12336 else
12337 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012338 break;
12339 case PyUnicode_2BYTE_KIND:
12340 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12341 break;
12342 case PyUnicode_4BYTE_KIND:
12343 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12344 break;
12345 default:
12346 assert(0);
12347 out = 0;
12348 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012349
12350 Py_DECREF(sep_obj);
12351 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 if (kind1 != kind)
12353 PyMem_Free(buf1);
12354 if (kind2 != kind)
12355 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012356
12357 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 onError:
12359 Py_DECREF(sep_obj);
12360 Py_DECREF(str_obj);
12361 if (kind1 != kind && buf1)
12362 PyMem_Free(buf1);
12363 if (kind2 != kind && buf2)
12364 PyMem_Free(buf2);
12365 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012366}
12367
12368PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012370\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012371Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012372the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012373found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012374
12375static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012376unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012377{
Victor Stinner9310abb2011-10-05 00:59:23 +020012378 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379}
12380
12381PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012382 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012384Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012385the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012386separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012387
12388static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012389unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390{
Victor Stinner9310abb2011-10-05 00:59:23 +020012391 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392}
12393
Alexander Belopolsky40018472011-02-26 01:02:56 +000012394PyObject *
12395PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012396{
12397 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012398
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012399 s = PyUnicode_FromObject(s);
12400 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012402 if (sep != NULL) {
12403 sep = PyUnicode_FromObject(sep);
12404 if (sep == NULL) {
12405 Py_DECREF(s);
12406 return NULL;
12407 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012408 }
12409
Victor Stinner9310abb2011-10-05 00:59:23 +020012410 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012411
12412 Py_DECREF(s);
12413 Py_XDECREF(sep);
12414 return result;
12415}
12416
12417PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012418 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012419\n\
12420Return a list of the words in S, using sep as the\n\
12421delimiter string, starting at the end of the string and\n\
12422working to the front. If maxsplit is given, at most maxsplit\n\
12423splits are done. If sep is not specified, any whitespace string\n\
12424is a separator.");
12425
12426static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012427unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012428{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012429 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012430 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012431 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012432
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012433 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12434 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012435 return NULL;
12436
12437 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012439 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012440 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012441 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012442 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012443}
12444
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012445PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447\n\
12448Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012449Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012450is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451
12452static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012453unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012455 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012456 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012458 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12459 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460 return NULL;
12461
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012462 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463}
12464
12465static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012466PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012468 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012469}
12470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012471PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012472 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012473\n\
12474Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012475and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476
12477static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012478unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012480 if (PyUnicode_READY(self) == -1)
12481 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012482 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483}
12484
Georg Brandlceee0772007-11-27 23:48:05 +000012485PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012486 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012487\n\
12488Return a translation table usable for str.translate().\n\
12489If there is only one argument, it must be a dictionary mapping Unicode\n\
12490ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012491Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012492If there are two arguments, they must be strings of equal length, and\n\
12493in the resulting dictionary, each character in x will be mapped to the\n\
12494character at the same position in y. If there is a third argument, it\n\
12495must be a string, whose characters will be mapped to None in the result.");
12496
12497static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012498unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012499{
12500 PyObject *x, *y = NULL, *z = NULL;
12501 PyObject *new = NULL, *key, *value;
12502 Py_ssize_t i = 0;
12503 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012504
Georg Brandlceee0772007-11-27 23:48:05 +000012505 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12506 return NULL;
12507 new = PyDict_New();
12508 if (!new)
12509 return NULL;
12510 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 int x_kind, y_kind, z_kind;
12512 void *x_data, *y_data, *z_data;
12513
Georg Brandlceee0772007-11-27 23:48:05 +000012514 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012515 if (!PyUnicode_Check(x)) {
12516 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12517 "be a string if there is a second argument");
12518 goto err;
12519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012521 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12522 "arguments must have equal length");
12523 goto err;
12524 }
12525 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 x_kind = PyUnicode_KIND(x);
12527 y_kind = PyUnicode_KIND(y);
12528 x_data = PyUnicode_DATA(x);
12529 y_data = PyUnicode_DATA(y);
12530 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12531 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012532 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012533 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012534 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012535 if (!value) {
12536 Py_DECREF(key);
12537 goto err;
12538 }
Georg Brandlceee0772007-11-27 23:48:05 +000012539 res = PyDict_SetItem(new, key, value);
12540 Py_DECREF(key);
12541 Py_DECREF(value);
12542 if (res < 0)
12543 goto err;
12544 }
12545 /* create entries for deleting chars in z */
12546 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012547 z_kind = PyUnicode_KIND(z);
12548 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012549 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012551 if (!key)
12552 goto err;
12553 res = PyDict_SetItem(new, key, Py_None);
12554 Py_DECREF(key);
12555 if (res < 0)
12556 goto err;
12557 }
12558 }
12559 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012560 int kind;
12561 void *data;
12562
Georg Brandlceee0772007-11-27 23:48:05 +000012563 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012564 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012565 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12566 "to maketrans it must be a dict");
12567 goto err;
12568 }
12569 /* copy entries into the new dict, converting string keys to int keys */
12570 while (PyDict_Next(x, &i, &key, &value)) {
12571 if (PyUnicode_Check(key)) {
12572 /* convert string keys to integer keys */
12573 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012574 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012575 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12576 "table must be of length 1");
12577 goto err;
12578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 kind = PyUnicode_KIND(key);
12580 data = PyUnicode_DATA(key);
12581 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012582 if (!newkey)
12583 goto err;
12584 res = PyDict_SetItem(new, newkey, value);
12585 Py_DECREF(newkey);
12586 if (res < 0)
12587 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012588 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012589 /* just keep integer keys */
12590 if (PyDict_SetItem(new, key, value) < 0)
12591 goto err;
12592 } else {
12593 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12594 "be strings or integers");
12595 goto err;
12596 }
12597 }
12598 }
12599 return new;
12600 err:
12601 Py_DECREF(new);
12602 return NULL;
12603}
12604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012605PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012606 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607\n\
12608Return a copy of the string S, where all characters have been mapped\n\
12609through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012610Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012611Unmapped characters are left untouched. Characters mapped to None\n\
12612are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613
12614static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012617 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618}
12619
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012620PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012621 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012623Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
12625static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012626unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012628 if (PyUnicode_READY(self) == -1)
12629 return NULL;
12630 if (PyUnicode_IS_ASCII(self))
12631 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012632 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633}
12634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012635PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012636 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012638Pad a numeric string S with zeros on the left, to fill a field\n\
12639of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640
12641static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012642unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012644 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012645 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012646 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 int kind;
12648 void *data;
12649 Py_UCS4 chr;
12650
Martin v. Löwis18e16552006-02-15 17:27:45 +000012651 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652 return NULL;
12653
Benjamin Petersonbac79492012-01-14 13:34:47 -050012654 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012655 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656
Victor Stinnerc4b49542011-12-11 22:44:26 +010012657 if (PyUnicode_GET_LENGTH(self) >= width)
12658 return unicode_result_unchanged(self);
12659
12660 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661
12662 u = pad(self, fill, 0, '0');
12663
Walter Dörwald068325e2002-04-15 13:36:47 +000012664 if (u == NULL)
12665 return NULL;
12666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 kind = PyUnicode_KIND(u);
12668 data = PyUnicode_DATA(u);
12669 chr = PyUnicode_READ(kind, data, fill);
12670
12671 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 PyUnicode_WRITE(kind, data, 0, chr);
12674 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675 }
12676
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012677 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012678 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680
#if 0
/* Compiled out: wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012689PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012690 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012692Return True if S starts with the specified prefix, False otherwise.\n\
12693With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012694With optional end, stop comparing S at that position.\n\
12695prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696
12697static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012698unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012701 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012702 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012703 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012704 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012705 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
Jesus Ceaac451502011-04-20 17:09:23 +020012707 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012708 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012709 if (PyTuple_Check(subobj)) {
12710 Py_ssize_t i;
12711 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012712 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012713 if (substring == NULL)
12714 return NULL;
12715 result = tailmatch(self, substring, start, end, -1);
12716 Py_DECREF(substring);
12717 if (result) {
12718 Py_RETURN_TRUE;
12719 }
12720 }
12721 /* nothing matched */
12722 Py_RETURN_FALSE;
12723 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012724 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012725 if (substring == NULL) {
12726 if (PyErr_ExceptionMatches(PyExc_TypeError))
12727 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12728 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012730 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012731 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012732 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012733 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734}
12735
12736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012737PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012739\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012740Return True if S ends with the specified suffix, False otherwise.\n\
12741With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012742With optional end, stop comparing S at that position.\n\
12743suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012744
12745static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012746unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012748{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012749 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012750 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012751 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012752 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012753 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012754
Jesus Ceaac451502011-04-20 17:09:23 +020012755 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012756 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012757 if (PyTuple_Check(subobj)) {
12758 Py_ssize_t i;
12759 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012760 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012761 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012762 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012763 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012764 result = tailmatch(self, substring, start, end, +1);
12765 Py_DECREF(substring);
12766 if (result) {
12767 Py_RETURN_TRUE;
12768 }
12769 }
12770 Py_RETURN_FALSE;
12771 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012772 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012773 if (substring == NULL) {
12774 if (PyErr_ExceptionMatches(PyExc_TypeError))
12775 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12776 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012777 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012778 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012781 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012782}
12783
Victor Stinner202fdca2012-05-07 12:47:02 +020012784Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012785_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012786{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012787 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012788 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12789 writer->data = PyUnicode_DATA(writer->buffer);
12790 writer->kind = PyUnicode_KIND(writer->buffer);
12791}
12792
Victor Stinnerd3f08822012-05-29 12:57:52 +020012793void
12794_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012795{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012796 memset(writer, 0, sizeof(*writer));
12797#ifdef Py_DEBUG
12798 writer->kind = 5; /* invalid kind */
12799#endif
12800 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012801 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012802}
12803
Victor Stinnerd3f08822012-05-29 12:57:52 +020012804int
12805_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12806 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012807{
12808 Py_ssize_t newlen;
12809 PyObject *newbuffer;
12810
Victor Stinnerd3f08822012-05-29 12:57:52 +020012811 assert(length > 0);
12812
Victor Stinner202fdca2012-05-07 12:47:02 +020012813 if (length > PY_SSIZE_T_MAX - writer->pos) {
12814 PyErr_NoMemory();
12815 return -1;
12816 }
12817 newlen = writer->pos + length;
12818
Victor Stinnerd3f08822012-05-29 12:57:52 +020012819 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012820 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012821 /* overallocate 25% to limit the number of resize */
12822 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12823 newlen += newlen / 4;
12824 if (newlen < writer->min_length)
12825 newlen = writer->min_length;
12826 }
12827 writer->buffer = PyUnicode_New(newlen, maxchar);
12828 if (writer->buffer == NULL)
12829 return -1;
12830 _PyUnicodeWriter_Update(writer);
12831 return 0;
12832 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012833
Victor Stinnerd3f08822012-05-29 12:57:52 +020012834 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012835 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012836 /* overallocate 25% to limit the number of resize */
12837 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12838 newlen += newlen / 4;
12839 if (newlen < writer->min_length)
12840 newlen = writer->min_length;
12841 }
12842
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012843 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012844 /* resize + widen */
12845 newbuffer = PyUnicode_New(newlen, maxchar);
12846 if (newbuffer == NULL)
12847 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012848 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12849 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012850 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012851 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012852 }
12853 else {
12854 newbuffer = resize_compact(writer->buffer, newlen);
12855 if (newbuffer == NULL)
12856 return -1;
12857 }
12858 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012859 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012860 }
12861 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012862 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012863 newbuffer = PyUnicode_New(writer->size, maxchar);
12864 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012865 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012866 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12867 writer->buffer, 0, writer->pos);
12868 Py_DECREF(writer->buffer);
12869 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012870 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012871 }
12872 return 0;
12873}
12874
Victor Stinnerd3f08822012-05-29 12:57:52 +020012875int
12876_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12877{
12878 Py_UCS4 maxchar;
12879 Py_ssize_t len;
12880
12881 if (PyUnicode_READY(str) == -1)
12882 return -1;
12883 len = PyUnicode_GET_LENGTH(str);
12884 if (len == 0)
12885 return 0;
12886 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12887 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012888 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012889 Py_INCREF(str);
12890 writer->buffer = str;
12891 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012892 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012893 writer->size = 0;
12894 writer->pos += len;
12895 return 0;
12896 }
12897 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12898 return -1;
12899 }
12900 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12901 str, 0, len);
12902 writer->pos += len;
12903 return 0;
12904}
12905
12906PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012907_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012908{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012909 if (writer->pos == 0) {
12910 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012911 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012912 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012913 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012914 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12915 return writer->buffer;
12916 }
12917 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12918 PyObject *newbuffer;
12919 newbuffer = resize_compact(writer->buffer, writer->pos);
12920 if (newbuffer == NULL) {
12921 Py_DECREF(writer->buffer);
12922 return NULL;
12923 }
12924 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012925 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012926 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner202fdca2012-05-07 12:47:02 +020012927 return writer->buffer;
12928}
12929
Victor Stinnerd3f08822012-05-29 12:57:52 +020012930void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012931_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012932{
12933 Py_CLEAR(writer->buffer);
12934}
12935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012937
12938PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012939 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012940\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012941Return a formatted version of S, using substitutions from args and kwargs.\n\
12942The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012943
Eric Smith27bbca62010-11-04 17:06:58 +000012944PyDoc_STRVAR(format_map__doc__,
12945 "S.format_map(mapping) -> str\n\
12946\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012947Return a formatted version of S, using substitutions from mapping.\n\
12948The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012949
Eric Smith4a7d76d2008-05-30 18:10:19 +000012950static PyObject *
12951unicode__format__(PyObject* self, PyObject* args)
12952{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012953 PyObject *format_spec;
12954 _PyUnicodeWriter writer;
12955 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012956
12957 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12958 return NULL;
12959
Victor Stinnerd3f08822012-05-29 12:57:52 +020012960 if (PyUnicode_READY(self) == -1)
12961 return NULL;
12962 _PyUnicodeWriter_Init(&writer, 0);
12963 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12964 self, format_spec, 0,
12965 PyUnicode_GET_LENGTH(format_spec));
12966 if (ret == -1) {
12967 _PyUnicodeWriter_Dealloc(&writer);
12968 return NULL;
12969 }
12970 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012971}
12972
Eric Smith8c663262007-08-25 02:26:07 +000012973PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012974 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012975\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012976Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012977
12978static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012979unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 Py_ssize_t size;
12982
12983 /* If it's a compact object, account for base structure +
12984 character data. */
12985 if (PyUnicode_IS_COMPACT_ASCII(v))
12986 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12987 else if (PyUnicode_IS_COMPACT(v))
12988 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012989 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 else {
12991 /* If it is a two-block object, account for base object, and
12992 for character block if present. */
12993 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012994 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012996 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 }
12998 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012999 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013000 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013002 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013003 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004
13005 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013006}
13007
13008PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013010
13011static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013012unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013013{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013014 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 if (!copy)
13016 return NULL;
13017 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013018}
13019
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013021 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013022 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013023 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13024 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013025 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13026 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013027 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013028 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13029 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13030 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13031 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13032 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013034 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13035 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13036 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013037 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013038 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13039 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13040 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013041 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013042 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013043 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013044 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013045 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13046 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13047 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13048 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13049 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13050 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13051 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13052 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13053 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13054 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13055 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13056 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13057 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13058 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013059 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013060 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013061 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013062 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013063 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013064 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013065 {"maketrans", (PyCFunction) unicode_maketrans,
13066 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013067 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013068#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013069 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013070 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071#endif
13072
Benjamin Peterson14339b62009-01-31 16:36:08 +000013073 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074 {NULL, NULL}
13075};
13076
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013077static PyObject *
13078unicode_mod(PyObject *v, PyObject *w)
13079{
Brian Curtindfc80e32011-08-10 20:28:54 -050013080 if (!PyUnicode_Check(v))
13081 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013083}
13084
13085static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013086 0, /*nb_add*/
13087 0, /*nb_subtract*/
13088 0, /*nb_multiply*/
13089 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013090};
13091
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013093 (lenfunc) unicode_length, /* sq_length */
13094 PyUnicode_Concat, /* sq_concat */
13095 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13096 (ssizeargfunc) unicode_getitem, /* sq_item */
13097 0, /* sq_slice */
13098 0, /* sq_ass_item */
13099 0, /* sq_ass_slice */
13100 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101};
13102
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013103static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013104unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 if (PyUnicode_READY(self) == -1)
13107 return NULL;
13108
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013109 if (PyIndex_Check(item)) {
13110 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013111 if (i == -1 && PyErr_Occurred())
13112 return NULL;
13113 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013115 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013116 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013117 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013118 PyObject *result;
13119 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013120 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013121 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013124 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013125 return NULL;
13126 }
13127
13128 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013129 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013131 slicelength == PyUnicode_GET_LENGTH(self)) {
13132 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013133 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013134 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013135 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013136 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013137 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013138 src_kind = PyUnicode_KIND(self);
13139 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013140 if (!PyUnicode_IS_ASCII(self)) {
13141 kind_limit = kind_maxchar_limit(src_kind);
13142 max_char = 0;
13143 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13144 ch = PyUnicode_READ(src_kind, src_data, cur);
13145 if (ch > max_char) {
13146 max_char = ch;
13147 if (max_char >= kind_limit)
13148 break;
13149 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013150 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013151 }
Victor Stinner55c99112011-10-13 01:17:06 +020013152 else
13153 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013154 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013155 if (result == NULL)
13156 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013158 dest_data = PyUnicode_DATA(result);
13159
13160 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013161 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13162 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013163 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013164 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013165 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013166 } else {
13167 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13168 return NULL;
13169 }
13170}
13171
13172static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013173 (lenfunc)unicode_length, /* mp_length */
13174 (binaryfunc)unicode_subscript, /* mp_subscript */
13175 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013176};
13177
Guido van Rossumd57fd912000-03-10 22:53:23 +000013178
Guido van Rossumd57fd912000-03-10 22:53:23 +000013179/* Helpers for PyUnicode_Format() */
13180
13181static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013182getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013183{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013184 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 (*p_argidx)++;
13187 if (arglen < 0)
13188 return args;
13189 else
13190 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191 }
13192 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194 return NULL;
13195}
13196
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013197/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198
Victor Stinnerd3f08822012-05-29 12:57:52 +020013199static int
13200formatfloat(PyObject *v, int flags, int prec, int type,
13201 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013203 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013205 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013206
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 x = PyFloat_AsDouble(v);
13208 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013209 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013210
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013213
Eric Smith0923d1d2009-04-16 20:16:10 +000013214 p = PyOS_double_to_string(x, type, prec,
13215 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013216 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013217 return -1;
13218 len = strlen(p);
13219 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013220 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13221 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013222 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013223 }
Victor Stinner184252a2012-06-16 02:57:41 +020013224 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013225 writer->pos += len;
13226 }
13227 else
13228 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013229 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013230 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231}
13232
Victor Stinnerd0880d52012-04-27 23:40:13 +020013233/* formatlong() emulates the format codes d, u, o, x and X, and
13234 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13235 * Python's regular ints.
13236 * Return value: a new PyUnicodeObject*, or NULL if error.
13237 * The output string is of the form
13238 * "-"? ("0x" | "0X")? digit+
13239 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13240 * set in flags. The case of hex digits will be correct,
13241 * There will be at least prec digits, zero-filled on the left if
13242 * necessary to get that many.
13243 * val object to be converted
13244 * flags bitmask of format flags; only F_ALT is looked at
13245 * prec minimum number of digits; 0-fill on left if needed
13246 * type a character in [duoxX]; u acts the same as d
13247 *
13248 * CAUTION: o, x and X conversions on regular ints can never
13249 * produce a '-' sign, but can for Python's unbounded ints.
13250 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013251static PyObject*
13252formatlong(PyObject *val, int flags, int prec, int type)
13253{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013254 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013255 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013256 Py_ssize_t i;
13257 int sign; /* 1 if '-', else 0 */
13258 int len; /* number of characters */
13259 Py_ssize_t llen;
13260 int numdigits; /* len == numnondigits + numdigits */
13261 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013262
Victor Stinnerd0880d52012-04-27 23:40:13 +020013263 /* Avoid exceeding SSIZE_T_MAX */
13264 if (prec > INT_MAX-3) {
13265 PyErr_SetString(PyExc_OverflowError,
13266 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013268 }
13269
13270 assert(PyLong_Check(val));
13271
13272 switch (type) {
13273 case 'd':
13274 case 'u':
13275 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013276 if (PyBool_Check(val))
13277 result = PyNumber_ToBase(val, 10);
13278 else
13279 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013280 break;
13281 case 'o':
13282 numnondigits = 2;
13283 result = PyNumber_ToBase(val, 8);
13284 break;
13285 case 'x':
13286 case 'X':
13287 numnondigits = 2;
13288 result = PyNumber_ToBase(val, 16);
13289 break;
13290 default:
13291 assert(!"'type' not in [duoxX]");
13292 }
13293 if (!result)
13294 return NULL;
13295
13296 assert(unicode_modifiable(result));
13297 assert(PyUnicode_IS_READY(result));
13298 assert(PyUnicode_IS_ASCII(result));
13299
13300 /* To modify the string in-place, there can only be one reference. */
13301 if (Py_REFCNT(result) != 1) {
13302 PyErr_BadInternalCall();
13303 return NULL;
13304 }
13305 buf = PyUnicode_DATA(result);
13306 llen = PyUnicode_GET_LENGTH(result);
13307 if (llen > INT_MAX) {
13308 PyErr_SetString(PyExc_ValueError,
13309 "string too large in _PyBytes_FormatLong");
13310 return NULL;
13311 }
13312 len = (int)llen;
13313 sign = buf[0] == '-';
13314 numnondigits += sign;
13315 numdigits = len - numnondigits;
13316 assert(numdigits > 0);
13317
13318 /* Get rid of base marker unless F_ALT */
13319 if (((flags & F_ALT) == 0 &&
13320 (type == 'o' || type == 'x' || type == 'X'))) {
13321 assert(buf[sign] == '0');
13322 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13323 buf[sign+1] == 'o');
13324 numnondigits -= 2;
13325 buf += 2;
13326 len -= 2;
13327 if (sign)
13328 buf[0] = '-';
13329 assert(len == numnondigits + numdigits);
13330 assert(numdigits > 0);
13331 }
13332
13333 /* Fill with leading zeroes to meet minimum width. */
13334 if (prec > numdigits) {
13335 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13336 numnondigits + prec);
13337 char *b1;
13338 if (!r1) {
13339 Py_DECREF(result);
13340 return NULL;
13341 }
13342 b1 = PyBytes_AS_STRING(r1);
13343 for (i = 0; i < numnondigits; ++i)
13344 *b1++ = *buf++;
13345 for (i = 0; i < prec - numdigits; i++)
13346 *b1++ = '0';
13347 for (i = 0; i < numdigits; i++)
13348 *b1++ = *buf++;
13349 *b1 = '\0';
13350 Py_DECREF(result);
13351 result = r1;
13352 buf = PyBytes_AS_STRING(result);
13353 len = numnondigits + prec;
13354 }
13355
13356 /* Fix up case for hex conversions. */
13357 if (type == 'X') {
13358 /* Need to convert all lower case letters to upper case.
13359 and need to convert 0x to 0X (and -0x to -0X). */
13360 for (i = 0; i < len; i++)
13361 if (buf[i] >= 'a' && buf[i] <= 'x')
13362 buf[i] -= 'a'-'A';
13363 }
13364 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13365 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013367 Py_DECREF(result);
13368 result = unicode;
13369 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013370 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013371}
13372
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013373static Py_UCS4
13374formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013376 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013377 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013379 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 goto onError;
13382 }
13383 else {
13384 /* Integer input truncated to a character */
13385 long x;
13386 x = PyLong_AsLong(v);
13387 if (x == -1 && PyErr_Occurred())
13388 goto onError;
13389
Victor Stinner8faf8212011-12-08 22:14:11 +010013390 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 PyErr_SetString(PyExc_OverflowError,
13392 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013393 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 }
13395
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013398
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013400 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013401 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013402 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013403}
13404
Alexander Belopolsky40018472011-02-26 01:02:56 +000013405PyObject *
13406PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013408 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 PyObject *temp = NULL;
13412 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013413 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013414 void *fmt;
13415 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013416 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013417 Py_ssize_t sublen;
13418 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013419
Guido van Rossumd57fd912000-03-10 22:53:23 +000013420 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 PyErr_BadInternalCall();
13422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013424 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013425 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013426 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013427 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013428 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013429 return NULL;
13430 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013432 fmt = PyUnicode_DATA(uformat);
13433 fmtkind = PyUnicode_KIND(uformat);
13434 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13435 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436
Victor Stinnerd3f08822012-05-29 12:57:52 +020013437 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013438
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 arglen = PyTuple_Size(args);
13441 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442 }
13443 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 arglen = -1;
13445 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013447 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449
13450 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013451 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013452 Py_ssize_t nonfmtpos;
13453 nonfmtpos = fmtpos++;
13454 while (fmtcnt >= 0 &&
13455 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13456 fmtpos++;
13457 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013458 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013459 if (fmtcnt < 0)
13460 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013461 sublen = fmtpos - nonfmtpos;
13462 maxchar = _PyUnicode_FindMaxChar(uformat,
13463 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013464 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013465 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013466
Victor Stinnerd3f08822012-05-29 12:57:52 +020013467 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13468 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013469 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013470 }
13471 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 /* Got a format specifier */
13473 int flags = 0;
13474 Py_ssize_t width = -1;
13475 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013476 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013477 Py_UCS4 fill;
13478 int sign;
13479 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 int isnumok;
13481 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013482 void *pbuf = NULL;
13483 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013484 Py_UCS4 bufmaxchar;
13485 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013488 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13489 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013491 Py_ssize_t keylen;
13492 PyObject *key;
13493 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013494
Benjamin Peterson29060642009-01-31 22:14:21 +000013495 if (dict == NULL) {
13496 PyErr_SetString(PyExc_TypeError,
13497 "format requires a mapping");
13498 goto onError;
13499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013501 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 /* Skip over balanced parentheses */
13504 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013505 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13506 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013508 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013509 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013510 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013511 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013512 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 if (fmtcnt < 0 || pcount > 0) {
13514 PyErr_SetString(PyExc_ValueError,
13515 "incomplete format key");
13516 goto onError;
13517 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013518 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013519 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013520 if (key == NULL)
13521 goto onError;
13522 if (args_owned) {
13523 Py_DECREF(args);
13524 args_owned = 0;
13525 }
13526 args = PyObject_GetItem(dict, key);
13527 Py_DECREF(key);
13528 if (args == NULL) {
13529 goto onError;
13530 }
13531 args_owned = 1;
13532 arglen = -1;
13533 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013535 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013536 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13537 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 case '-': flags |= F_LJUST; continue;
13539 case '+': flags |= F_SIGN; continue;
13540 case ' ': flags |= F_BLANK; continue;
13541 case '#': flags |= F_ALT; continue;
13542 case '0': flags |= F_ZERO; continue;
13543 }
13544 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013545 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013546 if (c == '*') {
13547 v = getnextarg(args, arglen, &argidx);
13548 if (v == NULL)
13549 goto onError;
13550 if (!PyLong_Check(v)) {
13551 PyErr_SetString(PyExc_TypeError,
13552 "* wants int");
13553 goto onError;
13554 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013555 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 if (width == -1 && PyErr_Occurred())
13557 goto onError;
13558 if (width < 0) {
13559 flags |= F_LJUST;
13560 width = -width;
13561 }
13562 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 }
13565 else if (c >= '0' && c <= '9') {
13566 width = c - '0';
13567 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013568 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013569 if (c < '0' || c > '9')
13570 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013571 /* Since c is unsigned, the RHS would end up as unsigned,
13572 mixing signed and unsigned comparison. Since c is between
13573 '0' and '9', casting to int is safe. */
13574 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013575 PyErr_SetString(PyExc_ValueError,
13576 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013577 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 }
13579 width = width*10 + (c - '0');
13580 }
13581 }
13582 if (c == '.') {
13583 prec = 0;
13584 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013585 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013586 if (c == '*') {
13587 v = getnextarg(args, arglen, &argidx);
13588 if (v == NULL)
13589 goto onError;
13590 if (!PyLong_Check(v)) {
13591 PyErr_SetString(PyExc_TypeError,
13592 "* wants int");
13593 goto onError;
13594 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013595 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 if (prec == -1 && PyErr_Occurred())
13597 goto onError;
13598 if (prec < 0)
13599 prec = 0;
13600 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013602 }
13603 else if (c >= '0' && c <= '9') {
13604 prec = c - '0';
13605 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 if (c < '0' || c > '9')
13608 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013609 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 PyErr_SetString(PyExc_ValueError,
13611 "prec too big");
13612 goto onError;
13613 }
13614 prec = prec*10 + (c - '0');
13615 }
13616 }
13617 } /* prec */
13618 if (fmtcnt >= 0) {
13619 if (c == 'h' || c == 'l' || c == 'L') {
13620 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013621 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013622 }
13623 }
13624 if (fmtcnt < 0) {
13625 PyErr_SetString(PyExc_ValueError,
13626 "incomplete format");
13627 goto onError;
13628 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013629 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013630 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013631
13632 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013633 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013634 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013635 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13636 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013637 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013638 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013639
Victor Stinneraff3cc62012-04-30 05:19:21 +020013640 v = getnextarg(args, arglen, &argidx);
13641 if (v == NULL)
13642 goto onError;
13643
Benjamin Peterson29060642009-01-31 22:14:21 +000013644 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013645 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 fill = ' ';
13647 switch (c) {
13648
Benjamin Peterson29060642009-01-31 22:14:21 +000013649 case 's':
13650 case 'r':
13651 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13653 /* Fast path */
13654 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13655 goto onError;
13656 goto nextarg;
13657 }
13658
Victor Stinner808fc0a2010-03-22 12:50:40 +000013659 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013660 temp = v;
13661 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013662 }
13663 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013664 if (c == 's')
13665 temp = PyObject_Str(v);
13666 else if (c == 'r')
13667 temp = PyObject_Repr(v);
13668 else
13669 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013670 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013671 break;
13672
13673 case 'i':
13674 case 'd':
13675 case 'u':
13676 case 'o':
13677 case 'x':
13678 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013679 if (PyLong_CheckExact(v)
13680 && width == -1 && prec == -1
13681 && !(flags & (F_SIGN | F_BLANK)))
13682 {
13683 /* Fast path */
13684 switch(c)
13685 {
13686 case 'd':
13687 case 'i':
13688 case 'u':
13689 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13690 goto onError;
13691 goto nextarg;
13692 case 'x':
13693 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13694 goto onError;
13695 goto nextarg;
13696 case 'o':
13697 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13698 goto onError;
13699 goto nextarg;
13700 default:
13701 break;
13702 }
13703 }
13704
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 isnumok = 0;
13706 if (PyNumber_Check(v)) {
13707 PyObject *iobj=NULL;
13708
13709 if (PyLong_Check(v)) {
13710 iobj = v;
13711 Py_INCREF(iobj);
13712 }
13713 else {
13714 iobj = PyNumber_Long(v);
13715 }
13716 if (iobj!=NULL) {
13717 if (PyLong_Check(iobj)) {
13718 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013719 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013720 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013721 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013722 }
13723 else {
13724 Py_DECREF(iobj);
13725 }
13726 }
13727 }
13728 if (!isnumok) {
13729 PyErr_Format(PyExc_TypeError,
13730 "%%%c format: a number is required, "
13731 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13732 goto onError;
13733 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013734 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013735 fill = '0';
13736 break;
13737
13738 case 'e':
13739 case 'E':
13740 case 'f':
13741 case 'F':
13742 case 'g':
13743 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013744 if (width == -1 && prec == -1
13745 && !(flags & (F_SIGN | F_BLANK)))
13746 {
13747 /* Fast path */
13748 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13749 goto onError;
13750 goto nextarg;
13751 }
13752
Benjamin Peterson29060642009-01-31 22:14:21 +000013753 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013754 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013755 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013756 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13757 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 break;
13759
13760 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013761 {
13762 Py_UCS4 ch = formatchar(v);
13763 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013764 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013765 if (width == -1 && prec == -1) {
13766 /* Fast path */
13767 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13768 goto onError;
13769 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13770 writer.pos += 1;
13771 goto nextarg;
13772 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013773 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013774 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013775 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013776
13777 default:
13778 PyErr_Format(PyExc_ValueError,
13779 "unsupported format character '%c' (0x%x) "
13780 "at index %zd",
13781 (31<=c && c<=126) ? (char)c : '?',
13782 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013783 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013784 goto onError;
13785 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013786 if (temp == NULL)
13787 goto onError;
13788 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013789
13790 if (width == -1 && prec == -1
13791 && !(flags & (F_SIGN | F_BLANK)))
13792 {
13793 /* Fast path */
13794 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13795 goto onError;
13796 goto nextarg;
13797 }
13798
Victor Stinneraff3cc62012-04-30 05:19:21 +020013799 if (PyUnicode_READY(temp) == -1) {
13800 Py_CLEAR(temp);
13801 goto onError;
13802 }
13803 kind = PyUnicode_KIND(temp);
13804 pbuf = PyUnicode_DATA(temp);
13805 len = PyUnicode_GET_LENGTH(temp);
13806
13807 if (c == 's' || c == 'r' || c == 'a') {
13808 if (prec >= 0 && len > prec)
13809 len = prec;
13810 }
13811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 /* pbuf is initialized here. */
13813 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013814 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013815 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13816 if (ch == '-' || ch == '+') {
13817 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013818 len--;
13819 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013820 }
13821 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013822 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013824 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013825 else
13826 sign = 0;
13827 }
13828 if (width < len)
13829 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013830
13831 /* Compute the length and maximum character of the
13832 written characters */
13833 bufmaxchar = 127;
13834 if (!(flags & F_LJUST)) {
13835 if (sign) {
13836 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013838 }
13839 else {
13840 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013842 }
13843 }
13844 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013846
13847 buflen = width;
13848 if (sign && len == width)
13849 buflen++;
13850
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013851 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013852 goto onError;
13853
13854 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013855 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013856 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013857 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13858 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013859 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 if (width > len)
13861 width--;
13862 }
13863 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013865 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013866 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013867 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13868 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13869 writer.pos += 2;
13870 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013871 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 width -= 2;
13873 if (width < 0)
13874 width = 0;
13875 len -= 2;
13876 }
13877 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013878 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013879 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13880 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013881 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013882 }
13883 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013884 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013885 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13886 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013888 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013889 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13890 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013891 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13892 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13893 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013894 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013895 }
13896 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013897
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013898 if (len) {
13899 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13900 temp, pindex, len);
13901 writer.pos += len;
13902 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013903 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013904 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013905 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13906 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013907 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013908
Victor Stinnerd3f08822012-05-29 12:57:52 +020013909nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013910 if (dict && (argidx < arglen) && c != '%') {
13911 PyErr_SetString(PyExc_TypeError,
13912 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 goto onError;
13914 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013915 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013916 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013917 } /* until end */
13918 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 PyErr_SetString(PyExc_TypeError,
13920 "not all arguments converted during string formatting");
13921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013922 }
13923
13924 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013926 }
13927 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013928 Py_XDECREF(temp);
13929 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013930 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013933 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013934 Py_XDECREF(temp);
13935 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013936 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013937 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013938 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939 }
13940 return NULL;
13941}
13942
Jeremy Hylton938ace62002-07-17 16:30:39 +000013943static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013944unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13945
Tim Peters6d6c1a32001-08-02 04:15:00 +000013946static PyObject *
13947unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13948{
Benjamin Peterson29060642009-01-31 22:14:21 +000013949 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013950 static char *kwlist[] = {"object", "encoding", "errors", 0};
13951 char *encoding = NULL;
13952 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013953
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 if (type != &PyUnicode_Type)
13955 return unicode_subtype_new(type, args, kwds);
13956 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013957 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013958 return NULL;
13959 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020013960 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000013961 if (encoding == NULL && errors == NULL)
13962 return PyObject_Str(x);
13963 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013964 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013965}
13966
Guido van Rossume023fe02001-08-30 03:12:59 +000013967static PyObject *
13968unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013970 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013971 Py_ssize_t length, char_size;
13972 int share_wstr, share_utf8;
13973 unsigned int kind;
13974 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013975
Benjamin Peterson14339b62009-01-31 16:36:08 +000013976 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013977
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013978 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013979 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013980 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013981 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013982 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013983 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013984 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013985 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013986
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013987 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013988 if (self == NULL) {
13989 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 return NULL;
13991 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013992 kind = PyUnicode_KIND(unicode);
13993 length = PyUnicode_GET_LENGTH(unicode);
13994
13995 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013996#ifdef Py_DEBUG
13997 _PyUnicode_HASH(self) = -1;
13998#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013999 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014000#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014001 _PyUnicode_STATE(self).interned = 0;
14002 _PyUnicode_STATE(self).kind = kind;
14003 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014004 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014005 _PyUnicode_STATE(self).ready = 1;
14006 _PyUnicode_WSTR(self) = NULL;
14007 _PyUnicode_UTF8_LENGTH(self) = 0;
14008 _PyUnicode_UTF8(self) = NULL;
14009 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014010 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014011
14012 share_utf8 = 0;
14013 share_wstr = 0;
14014 if (kind == PyUnicode_1BYTE_KIND) {
14015 char_size = 1;
14016 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14017 share_utf8 = 1;
14018 }
14019 else if (kind == PyUnicode_2BYTE_KIND) {
14020 char_size = 2;
14021 if (sizeof(wchar_t) == 2)
14022 share_wstr = 1;
14023 }
14024 else {
14025 assert(kind == PyUnicode_4BYTE_KIND);
14026 char_size = 4;
14027 if (sizeof(wchar_t) == 4)
14028 share_wstr = 1;
14029 }
14030
14031 /* Ensure we won't overflow the length. */
14032 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14033 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014035 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014036 data = PyObject_MALLOC((length + 1) * char_size);
14037 if (data == NULL) {
14038 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014039 goto onError;
14040 }
14041
Victor Stinnerc3c74152011-10-02 20:39:55 +020014042 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014043 if (share_utf8) {
14044 _PyUnicode_UTF8_LENGTH(self) = length;
14045 _PyUnicode_UTF8(self) = data;
14046 }
14047 if (share_wstr) {
14048 _PyUnicode_WSTR_LENGTH(self) = length;
14049 _PyUnicode_WSTR(self) = (wchar_t *)data;
14050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014051
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014052 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014053 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014054 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014055#ifdef Py_DEBUG
14056 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14057#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014058 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014059 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014060
14061onError:
14062 Py_DECREF(unicode);
14063 Py_DECREF(self);
14064 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014065}
14066
/* Docstring for the str type; installed as tp_doc of PyUnicode_Type.
   The text documents both call forms handled by unicode_new(): plain
   conversion of an object, and decoding with an encoding and/or an
   error handler. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014078
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014079static PyObject *unicode_iter(PyObject *seq);
14080
/* Type object for str.  The initializer is positional, so each entry is
   labelled with the PyTypeObject slot it fills; a 0 leaves the slot
   unset. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_print */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_reserved */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash*/
    0,                            /* tp_call*/
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
14124
14125/* Initialize the Unicode implementation */
14126
Victor Stinner3a50e702011-10-18 21:21:00 +020014127int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014129 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014130 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014131 0x000A, /* LINE FEED */
14132 0x000D, /* CARRIAGE RETURN */
14133 0x001C, /* FILE SEPARATOR */
14134 0x001D, /* GROUP SEPARATOR */
14135 0x001E, /* RECORD SEPARATOR */
14136 0x0085, /* NEXT LINE */
14137 0x2028, /* LINE SEPARATOR */
14138 0x2029, /* PARAGRAPH SEPARATOR */
14139 };
14140
Fred Drakee4315f52000-05-09 19:53:39 +000014141 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014142 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014143 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014144 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014145 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014146
Guido van Rossumcacfc072002-05-24 19:01:59 +000014147 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014148 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014149
14150 /* initialize the linebreak bloom filter */
14151 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014152 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014153 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014154
14155 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014156
Benjamin Petersonc4311282012-10-30 23:21:10 -040014157 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14158 Py_FatalError("Can't initialize field name iterator type");
14159
14160 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14161 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014162
Victor Stinner3a50e702011-10-18 21:21:00 +020014163#ifdef HAVE_MBCS
14164 winver.dwOSVersionInfoSize = sizeof(winver);
14165 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14166 PyErr_SetFromWindowsErr(0);
14167 return -1;
14168 }
14169#endif
14170 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014171}
14172
14173/* Finalize the Unicode implementation */
14174
/* Release cached free-list entries.

   This implementation releases nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14180
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181void
Thomas Wouters78890102000-07-22 19:25:51 +000014182_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014183{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014184 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014185
Serhiy Storchaka05997252013-01-26 12:14:02 +020014186 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014187
Serhiy Storchaka05997252013-01-26 12:14:02 +020014188 for (i = 0; i < 256; i++)
14189 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014190 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014191 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014192}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014193
Walter Dörwald16807132007-05-25 13:52:07 +000014194void
14195PyUnicode_InternInPlace(PyObject **p)
14196{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014197 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014198 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014199#ifdef Py_DEBUG
14200 assert(s != NULL);
14201 assert(_PyUnicode_CHECK(s));
14202#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014203 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014204 return;
14205#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014206 /* If it's a subclass, we don't really know what putting
14207 it in the interned dict might do. */
14208 if (!PyUnicode_CheckExact(s))
14209 return;
14210 if (PyUnicode_CHECK_INTERNED(s))
14211 return;
14212 if (interned == NULL) {
14213 interned = PyDict_New();
14214 if (interned == NULL) {
14215 PyErr_Clear(); /* Don't leave an exception */
14216 return;
14217 }
14218 }
14219 /* It might be that the GetItem call fails even
14220 though the key is present in the dictionary,
14221 namely when this happens during a stack overflow. */
14222 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014223 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014224 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014225
Benjamin Peterson29060642009-01-31 22:14:21 +000014226 if (t) {
14227 Py_INCREF(t);
14228 Py_DECREF(*p);
14229 *p = t;
14230 return;
14231 }
Walter Dörwald16807132007-05-25 13:52:07 +000014232
Benjamin Peterson14339b62009-01-31 16:36:08 +000014233 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014234 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014235 PyErr_Clear();
14236 PyThreadState_GET()->recursion_critical = 0;
14237 return;
14238 }
14239 PyThreadState_GET()->recursion_critical = 0;
14240 /* The two references in interned are not counted by refcnt.
14241 The deallocator will take care of this */
14242 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014243 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014244}
14245
14246void
14247PyUnicode_InternImmortal(PyObject **p)
14248{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014249 PyUnicode_InternInPlace(p);
14250 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014251 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 Py_INCREF(*p);
14253 }
Walter Dörwald16807132007-05-25 13:52:07 +000014254}
14255
14256PyObject *
14257PyUnicode_InternFromString(const char *cp)
14258{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014259 PyObject *s = PyUnicode_FromString(cp);
14260 if (s == NULL)
14261 return NULL;
14262 PyUnicode_InternInPlace(&s);
14263 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014264}
14265
Alexander Belopolsky40018472011-02-26 01:02:56 +000014266void
14267_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014268{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014269 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014270 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 Py_ssize_t i, n;
14272 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014273
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 if (interned == NULL || !PyDict_Check(interned))
14275 return;
14276 keys = PyDict_Keys(interned);
14277 if (keys == NULL || !PyList_Check(keys)) {
14278 PyErr_Clear();
14279 return;
14280 }
Walter Dörwald16807132007-05-25 13:52:07 +000014281
Benjamin Peterson14339b62009-01-31 16:36:08 +000014282 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14283 detector, interned unicode strings are not forcibly deallocated;
14284 rather, we give them their stolen references back, and then clear
14285 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014286
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 n = PyList_GET_SIZE(keys);
14288 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014289 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014291 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014292 if (PyUnicode_READY(s) == -1) {
14293 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014294 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014296 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014297 case SSTATE_NOT_INTERNED:
14298 /* XXX Shouldn't happen */
14299 break;
14300 case SSTATE_INTERNED_IMMORTAL:
14301 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014302 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014303 break;
14304 case SSTATE_INTERNED_MORTAL:
14305 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014306 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014307 break;
14308 default:
14309 Py_FatalError("Inconsistent interned string state.");
14310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014311 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014312 }
14313 fprintf(stderr, "total size of all interned strings: "
14314 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14315 "mortal/immortal\n", mortal_size, immortal_size);
14316 Py_DECREF(keys);
14317 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014318 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014319}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014320
14321
14322/********************* Unicode Iterator **************************/
14323
14324typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014325 PyObject_HEAD
14326 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014327 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014328} unicodeiterobject;
14329
14330static void
14331unicodeiter_dealloc(unicodeiterobject *it)
14332{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014333 _PyObject_GC_UNTRACK(it);
14334 Py_XDECREF(it->it_seq);
14335 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014336}
14337
14338static int
14339unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14340{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014341 Py_VISIT(it->it_seq);
14342 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014343}
14344
14345static PyObject *
14346unicodeiter_next(unicodeiterobject *it)
14347{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014348 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014349
Benjamin Peterson14339b62009-01-31 16:36:08 +000014350 assert(it != NULL);
14351 seq = it->it_seq;
14352 if (seq == NULL)
14353 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014354 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14357 int kind = PyUnicode_KIND(seq);
14358 void *data = PyUnicode_DATA(seq);
14359 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14360 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014361 if (item != NULL)
14362 ++it->it_index;
14363 return item;
14364 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014365
Benjamin Peterson14339b62009-01-31 16:36:08 +000014366 Py_DECREF(seq);
14367 it->it_seq = NULL;
14368 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014369}
14370
14371static PyObject *
14372unicodeiter_len(unicodeiterobject *it)
14373{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014374 Py_ssize_t len = 0;
14375 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014376 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014378}
14379
14380PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14381
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014382static PyObject *
14383unicodeiter_reduce(unicodeiterobject *it)
14384{
14385 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014386 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014387 it->it_seq, it->it_index);
14388 } else {
14389 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14390 if (u == NULL)
14391 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014392 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014393 }
14394}
14395
14396PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14397
14398static PyObject *
14399unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14400{
14401 Py_ssize_t index = PyLong_AsSsize_t(state);
14402 if (index == -1 && PyErr_Occurred())
14403 return NULL;
14404 if (index < 0)
14405 index = 0;
14406 it->it_index = index;
14407 Py_RETURN_NONE;
14408}
14409
14410PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14411
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014412static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014413 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014415 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14416 reduce_doc},
14417 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14418 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014419 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014420};
14421
14422PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014423 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14424 "str_iterator", /* tp_name */
14425 sizeof(unicodeiterobject), /* tp_basicsize */
14426 0, /* tp_itemsize */
14427 /* methods */
14428 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14429 0, /* tp_print */
14430 0, /* tp_getattr */
14431 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014432 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014433 0, /* tp_repr */
14434 0, /* tp_as_number */
14435 0, /* tp_as_sequence */
14436 0, /* tp_as_mapping */
14437 0, /* tp_hash */
14438 0, /* tp_call */
14439 0, /* tp_str */
14440 PyObject_GenericGetAttr, /* tp_getattro */
14441 0, /* tp_setattro */
14442 0, /* tp_as_buffer */
14443 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14444 0, /* tp_doc */
14445 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14446 0, /* tp_clear */
14447 0, /* tp_richcompare */
14448 0, /* tp_weaklistoffset */
14449 PyObject_SelfIter, /* tp_iter */
14450 (iternextfunc)unicodeiter_next, /* tp_iternext */
14451 unicodeiter_methods, /* tp_methods */
14452 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014453};
14454
14455static PyObject *
14456unicode_iter(PyObject *seq)
14457{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014458 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014459
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 if (!PyUnicode_Check(seq)) {
14461 PyErr_BadInternalCall();
14462 return NULL;
14463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014464 if (PyUnicode_READY(seq) == -1)
14465 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014466 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14467 if (it == NULL)
14468 return NULL;
14469 it->it_index = 0;
14470 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014471 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014472 _PyObject_GC_TRACK(it);
14473 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014474}
14475
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014476
14477size_t
14478Py_UNICODE_strlen(const Py_UNICODE *u)
14479{
14480 int res = 0;
14481 while(*u++)
14482 res++;
14483 return res;
14484}
14485
14486Py_UNICODE*
14487Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14488{
14489 Py_UNICODE *u = s1;
14490 while ((*u++ = *s2++));
14491 return s1;
14492}
14493
14494Py_UNICODE*
14495Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14496{
14497 Py_UNICODE *u = s1;
14498 while ((*u++ = *s2++))
14499 if (n-- == 0)
14500 break;
14501 return s1;
14502}
14503
14504Py_UNICODE*
14505Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14506{
14507 Py_UNICODE *u1 = s1;
14508 u1 += Py_UNICODE_strlen(u1);
14509 Py_UNICODE_strcpy(u1, s2);
14510 return s1;
14511}
14512
14513int
14514Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14515{
14516 while (*s1 && *s2 && *s1 == *s2)
14517 s1++, s2++;
14518 if (*s1 && *s2)
14519 return (*s1 < *s2) ? -1 : +1;
14520 if (*s1)
14521 return 1;
14522 if (*s2)
14523 return -1;
14524 return 0;
14525}
14526
14527int
14528Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14529{
14530 register Py_UNICODE u1, u2;
14531 for (; n != 0; n--) {
14532 u1 = *s1;
14533 u2 = *s2;
14534 if (u1 != u2)
14535 return (u1 < u2) ? -1 : +1;
14536 if (u1 == '\0')
14537 return 0;
14538 s1++;
14539 s2++;
14540 }
14541 return 0;
14542}
14543
14544Py_UNICODE*
14545Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14546{
14547 const Py_UNICODE *p;
14548 for (p = s; *p; p++)
14549 if (*p == c)
14550 return (Py_UNICODE*)p;
14551 return NULL;
14552}
14553
14554Py_UNICODE*
14555Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14556{
14557 const Py_UNICODE *p;
14558 p = s + Py_UNICODE_strlen(s);
14559 while (p != s) {
14560 p--;
14561 if (*p == c)
14562 return (Py_UNICODE*)p;
14563 }
14564 return NULL;
14565}
Victor Stinner331ea922010-08-10 16:37:20 +000014566
Victor Stinner71133ff2010-09-01 23:43:53 +000014567Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014568PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014569{
Victor Stinner577db2c2011-10-11 22:12:48 +020014570 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014571 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014573 if (!PyUnicode_Check(unicode)) {
14574 PyErr_BadArgument();
14575 return NULL;
14576 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014577 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014578 if (u == NULL)
14579 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014580 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014581 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014582 PyErr_NoMemory();
14583 return NULL;
14584 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014585 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014586 size *= sizeof(Py_UNICODE);
14587 copy = PyMem_Malloc(size);
14588 if (copy == NULL) {
14589 PyErr_NoMemory();
14590 return NULL;
14591 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014592 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014593 return copy;
14594}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014595
Georg Brandl66c221e2010-10-14 07:04:07 +000014596/* A _string module, to export formatter_parser and formatter_field_name_split
14597 to the string.Formatter class implemented in Python. */
14598
14599static PyMethodDef _string_methods[] = {
14600 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14601 METH_O, PyDoc_STR("split the argument as a field name")},
14602 {"formatter_parser", (PyCFunction) formatter_parser,
14603 METH_O, PyDoc_STR("parse the argument as a format string")},
14604 {NULL, NULL}
14605};
14606
14607static struct PyModuleDef _string_module = {
14608 PyModuleDef_HEAD_INIT,
14609 "_string",
14610 PyDoc_STR("string helper module"),
14611 0,
14612 _string_methods,
14613 NULL,
14614 NULL,
14615 NULL,
14616 NULL
14617};
14618
14619PyMODINIT_FUNC
14620PyInit__string(void)
14621{
14622 return PyModule_Create(&_string_module);
14623}
14624
14625
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014626#ifdef __cplusplus
14627}
14628#endif