blob: c21e80c99d2e7878e3698c16d8a13a6d0578dee3 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, chosen by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
57
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000065
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066
/* Give the declarations that follow C linkage when compiled as C++.
   NOTE(review): the matching closing brace is not visible in this part of
   the file. */
#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
73
/* Object check used inside the assertions of the accessor macros in this
   file: with Py_DEBUG it calls _PyUnicode_CheckConsistency() with content
   checking disabled (second argument 0); otherwise it is PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020079
/* Field accessors.

   The underscore-prefixed macros that are plain casts (_PyUnicode_UTF8,
   _PyUnicode_UTF8_LENGTH, _PyUnicode_WSTR, _PyUnicode_WSTR_LENGTH,
   _PyUnicode_LENGTH, _PyUnicode_STATE, _PyUnicode_HASH,
   _PyUnicode_DATA_ANY) expand to the struct member itself, perform no
   checks, and can be used as assignment targets.  The remaining macros
   assert on the object before reading. */

/* Raw utf8 pointer member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string: for a compact ASCII object this is the
   address immediately after the PyASCIIObject struct, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for a compact ASCII object this is the
   length member, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting the object check. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting the object check. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200114
/* Optimized version of Py_MAX() to compute the maximum character:
   use it when you are computing the second argument of PyUnicode_New().

   The result is the bitwise OR of the two arguments, not the larger of the
   two: it is always >= both inputs but may equal neither of them.
   NOTE(review): presumably acceptable because PyUnicode_New() only needs to
   know which range the maximum character falls into -- confirm. */
#define MAX_MAXCHAR(maxchar1, maxchar2)                 \
    ((maxchar1) | (maxchar2))
119
/* Local replacement for PyUnicode_READY: asserts the object check, then
   evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200126
/* true if the utf8 pointer is the same address as the character data
   (asserts that the object is not compact ASCII) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the same address as the character data.
   The macro argument is used throughout; it previously referred to a
   variable named "unicode" in the caller's scope instead of "op". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
134
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, the utf8
   pointer is non-NULL, and it differs from the character data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready or wstr differs from the character data pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
150
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast to to_type.  The
   copy runs four characters at a time, then one at a time for the
   remainder.  All locals carry a leading underscore and every argument is
   parenthesized. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200174
Walter Dörwald16807132007-05-25 13:52:07 +0000175/* This dictionary holds all interned unicode strings. Note that references
176 to strings in this dictionary are *not* counted in the string's ob_refcnt.
177 When the interned string reaches a refcnt of 0 the string deallocation
178 function will delete the reference from this dictionary.
179
180 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000181 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000182*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200183static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000184
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000185/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200187
Serhiy Storchaka678db842013-01-26 12:16:36 +0200188#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 do { \
190 if (unicode_empty != NULL) \
191 Py_INCREF(unicode_empty); \
192 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200193 unicode_empty = PyUnicode_New(0, 0); \
194 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200195 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200196 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
197 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200198 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200199 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000200
Serhiy Storchaka678db842013-01-26 12:16:36 +0200201#define _Py_RETURN_UNICODE_EMPTY() \
202 do { \
203 _Py_INCREF_UNICODE_EMPTY(); \
204 return unicode_empty; \
205 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000206
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200207/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200208static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* Single character Unicode strings in the Latin-1 range are being
211 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200212static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000213
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by character value; an entry is 1 for
   whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
244
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200245/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200246static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200247static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100248static int unicode_modifiable(PyObject *unicode);
249
Victor Stinnerfe226c02011-10-03 03:52:20 +0200250
Alexander Belopolsky40018472011-02-26 01:02:56 +0000251static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100252_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200253static PyObject *
254_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
255static PyObject *
256_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
257
258static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000259unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000260 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100261 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000262 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
263
Alexander Belopolsky40018472011-02-26 01:02:56 +0000264static void
265raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300266 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100267 PyObject *unicode,
268 Py_ssize_t startpos, Py_ssize_t endpos,
269 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000270
/* Same for linebreaks.
   128-entry table indexed by character value; an entry is 1 for a line
   break character and 0 otherwise.  The table is only read, so it is
   declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
298
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300299/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
300 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000301Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000302PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000303{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000304#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000305 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000307 /* This is actually an illegal character, so it should
308 not be passed to unichr. */
309 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000310#endif
311}
312
#ifdef Py_DEBUG
/* Debug-only sanity check of a Unicode object's internal state.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of
                  PyUnicode_WCHAR_KIND), also scan the characters to verify
                  that the narrowest possible kind is in use and that the
                  data is followed by a zero character.

   Every violated invariant fails an assert(); the function itself always
   returns 1 so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject members exist */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data directly follows the struct */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr representation exists */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be zero */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429static PyObject*
430unicode_result_wchar(PyObject *unicode)
431{
432#ifndef Py_DEBUG
433 Py_ssize_t len;
434
435 assert(Py_REFCNT(unicode) == 1);
436
437 len = _PyUnicode_WSTR_LENGTH(unicode);
438 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100439 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200440 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100441 }
442
443 if (len == 1) {
444 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100445 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
447 Py_DECREF(unicode);
448 return latin1_char;
449 }
450 }
451
452 if (_PyUnicode_Ready(unicode) < 0) {
453 Py_XDECREF(unicode);
454 return NULL;
455 }
456#else
457 /* don't make the result ready in debug mode to ensure that the caller
458 makes the string ready before using it */
459 assert(_PyUnicode_CheckConsistency(unicode, 1));
460#endif
461 return unicode;
462}
463
464static PyObject*
465unicode_result_ready(PyObject *unicode)
466{
467 Py_ssize_t length;
468
469 length = PyUnicode_GET_LENGTH(unicode);
470 if (length == 0) {
471 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100472 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200473 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100474 }
475 return unicode_empty;
476 }
477
478 if (length == 1) {
479 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
480 if (ch < 256) {
481 PyObject *latin1_char = unicode_latin1[ch];
482 if (latin1_char != NULL) {
483 if (unicode != latin1_char) {
484 Py_INCREF(latin1_char);
485 Py_DECREF(unicode);
486 }
487 return latin1_char;
488 }
489 else {
490 assert(_PyUnicode_CheckConsistency(unicode, 1));
491 Py_INCREF(unicode);
492 unicode_latin1[ch] = unicode;
493 return unicode;
494 }
495 }
496 }
497
498 assert(_PyUnicode_CheckConsistency(unicode, 1));
499 return unicode;
500}
501
502static PyObject*
503unicode_result(PyObject *unicode)
504{
505 assert(_PyUnicode_CHECK(unicode));
506 if (PyUnicode_IS_READY(unicode))
507 return unicode_result_ready(unicode);
508 else
509 return unicode_result_wchar(unicode);
510}
511
Victor Stinnerc4b49542011-12-11 22:44:26 +0100512static PyObject*
513unicode_result_unchanged(PyObject *unicode)
514{
515 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500516 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100517 return NULL;
518 Py_INCREF(unicode);
519 return unicode;
520 }
521 else
522 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100523 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100524}
525
#ifdef HAVE_MBCS
/* Windows version information; only present when HAVE_MBCS is defined.
   NOTE(review): where it is filled in is not visible here. */
static OSVERSIONINFOEX winver;
#endif
529
Thomas Wouters477c8d52006-05-27 19:21:47 +0000530/* --- Bloom Filters ----------------------------------------------------- */
531
532/* stuff to implement simple "bloom filters" for Unicode characters.
533 to keep things simple, we use a single bitmask, using the least 5
534 bits from each unicode characters as the bit index. */
535
536/* the linebreak mask is set up by Unicode_Init below */
537
Antoine Pitrouf068f942010-01-13 14:19:12 +0000538#if LONG_BIT >= 128
539#define BLOOM_WIDTH 128
540#elif LONG_BIT >= 64
541#define BLOOM_WIDTH 64
542#elif LONG_BIT >= 32
543#define BLOOM_WIDTH 32
544#else
545#error "LONG_BIT is smaller than 32"
546#endif
547
Thomas Wouters477c8d52006-05-27 19:21:47 +0000548#define BLOOM_MASK unsigned long
549
Serhiy Storchaka05997252013-01-26 12:14:02 +0200550static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Antoine Pitrouf068f942010-01-13 14:19:12 +0000552#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
553#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554
Benjamin Peterson29060642009-01-31 22:14:21 +0000555#define BLOOM_LINEBREAK(ch) \
556 ((ch) < 128U ? ascii_linebreak[(ch)] : \
557 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000558
Alexander Belopolsky40018472011-02-26 01:02:56 +0000559Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200560make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000561{
562 /* calculate simple bloom-style bitmask for a given unicode string */
563
Antoine Pitrouf068f942010-01-13 14:19:12 +0000564 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000565 Py_ssize_t i;
566
567 mask = 0;
568 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 return mask;
572}
573
/* true if chr occurs in str: the bloom mask is tested first, and
   PyUnicode_FindChar() is only called over the whole string when the mask
   test passes. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM((mask), (chr))                                               \
     && (PyUnicode_FindChar((str), (chr), 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000577
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200578/* Compilation of templated routines */
579
580#include "stringlib/asciilib.h"
581#include "stringlib/fastsearch.h"
582#include "stringlib/partition.h"
583#include "stringlib/split.h"
584#include "stringlib/count.h"
585#include "stringlib/find.h"
586#include "stringlib/find_max_char.h"
587#include "stringlib/localeutil.h"
588#include "stringlib/undef.h"
589
590#include "stringlib/ucs1lib.h"
591#include "stringlib/fastsearch.h"
592#include "stringlib/partition.h"
593#include "stringlib/split.h"
594#include "stringlib/count.h"
595#include "stringlib/find.h"
596#include "stringlib/find_max_char.h"
597#include "stringlib/localeutil.h"
598#include "stringlib/undef.h"
599
600#include "stringlib/ucs2lib.h"
601#include "stringlib/fastsearch.h"
602#include "stringlib/partition.h"
603#include "stringlib/split.h"
604#include "stringlib/count.h"
605#include "stringlib/find.h"
606#include "stringlib/find_max_char.h"
607#include "stringlib/localeutil.h"
608#include "stringlib/undef.h"
609
610#include "stringlib/ucs4lib.h"
611#include "stringlib/fastsearch.h"
612#include "stringlib/partition.h"
613#include "stringlib/split.h"
614#include "stringlib/count.h"
615#include "stringlib/find.h"
616#include "stringlib/find_max_char.h"
617#include "stringlib/localeutil.h"
618#include "stringlib/undef.h"
619
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200620#include "stringlib/unicodedefs.h"
621#include "stringlib/fastsearch.h"
622#include "stringlib/count.h"
623#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100624#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200625
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626/* --- Unicode Object ----------------------------------------------------- */
627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200628static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200629fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200630
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200631Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
632 Py_ssize_t size, Py_UCS4 ch,
633 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200634{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
636
637 switch (kind) {
638 case PyUnicode_1BYTE_KIND:
639 {
640 Py_UCS1 ch1 = (Py_UCS1) ch;
641 if (ch1 == ch)
642 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
643 else
644 return -1;
645 }
646 case PyUnicode_2BYTE_KIND:
647 {
648 Py_UCS2 ch2 = (Py_UCS2) ch;
649 if (ch2 == ch)
650 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
651 else
652 return -1;
653 }
654 case PyUnicode_4BYTE_KIND:
655 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
656 default:
657 assert(0);
658 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200660}
661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662static PyObject*
663resize_compact(PyObject *unicode, Py_ssize_t length)
664{
665 Py_ssize_t char_size;
666 Py_ssize_t struct_size;
667 Py_ssize_t new_size;
668 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100669 PyObject *new_unicode;
Victor Stinner79891572012-05-03 13:43:07 +0200670 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100672 assert(PyUnicode_IS_COMPACT(unicode));
673
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200674 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100675 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200676 struct_size = sizeof(PyASCIIObject);
677 else
678 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200679 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
682 PyErr_NoMemory();
683 return NULL;
684 }
685 new_size = (struct_size + (length + 1) * char_size);
686
Victor Stinner84def372011-12-11 20:04:56 +0100687 _Py_DEC_REFTOTAL;
688 _Py_ForgetReference(unicode);
689
690 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
691 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100692 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200693 PyErr_NoMemory();
694 return NULL;
695 }
Victor Stinner84def372011-12-11 20:04:56 +0100696 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200697 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100698
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200700 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100702 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200703 _PyUnicode_WSTR_LENGTH(unicode) = length;
704 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100705 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
706 PyObject_DEL(_PyUnicode_WSTR(unicode));
707 _PyUnicode_WSTR(unicode) = NULL;
708 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
710 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200711 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200712 return unicode;
713}
714
Alexander Belopolsky40018472011-02-26 01:02:56 +0000715static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200716resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717{
Victor Stinner95663112011-10-04 01:03:50 +0200718 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100719 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000722
Victor Stinnerfe226c02011-10-03 03:52:20 +0200723 if (PyUnicode_IS_READY(unicode)) {
724 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200725 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 void *data;
727
728 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200729 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200730 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
731 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200732
733 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
734 PyErr_NoMemory();
735 return -1;
736 }
737 new_size = (length + 1) * char_size;
738
Victor Stinner7a9105a2011-12-12 00:13:42 +0100739 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
740 {
741 PyObject_DEL(_PyUnicode_UTF8(unicode));
742 _PyUnicode_UTF8(unicode) = NULL;
743 _PyUnicode_UTF8_LENGTH(unicode) = 0;
744 }
745
Victor Stinnerfe226c02011-10-03 03:52:20 +0200746 data = (PyObject *)PyObject_REALLOC(data, new_size);
747 if (data == NULL) {
748 PyErr_NoMemory();
749 return -1;
750 }
751 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200752 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200754 _PyUnicode_WSTR_LENGTH(unicode) = length;
755 }
756 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200757 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200758 _PyUnicode_UTF8_LENGTH(unicode) = length;
759 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200760 _PyUnicode_LENGTH(unicode) = length;
761 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200762 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200763 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 }
Victor Stinner95663112011-10-04 01:03:50 +0200767 assert(_PyUnicode_WSTR(unicode) != NULL);
768
769 /* check for integer overflow */
770 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
771 PyErr_NoMemory();
772 return -1;
773 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100774 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200775 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100776 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200777 if (!wstr) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 _PyUnicode_WSTR(unicode) = wstr;
782 _PyUnicode_WSTR(unicode)[length] = 0;
783 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200784 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785 return 0;
786}
787
Victor Stinnerfe226c02011-10-03 03:52:20 +0200788static PyObject*
789resize_copy(PyObject *unicode, Py_ssize_t length)
790{
791 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100792 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200793 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100794
Benjamin Petersonbac79492012-01-14 13:34:47 -0500795 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100796 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797
798 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
799 if (copy == NULL)
800 return NULL;
801
802 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200803 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200805 }
806 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200807 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100808
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200809 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200810 if (w == NULL)
811 return NULL;
812 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
813 copy_length = Py_MIN(copy_length, length);
814 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
815 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200816 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
818}
819
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000821 Ux0000 terminated; some code (e.g. new_identifier)
822 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823
824 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000825 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000826
827*/
828
Alexander Belopolsky40018472011-02-26 01:02:56 +0000829static PyUnicodeObject *
830_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000831{
832 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834
Thomas Wouters477c8d52006-05-27 19:21:47 +0000835 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000836 if (length == 0 && unicode_empty != NULL) {
837 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200838 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000839 }
840
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000841 /* Ensure we won't overflow the size. */
842 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
843 return (PyUnicodeObject *)PyErr_NoMemory();
844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200845 if (length < 0) {
846 PyErr_SetString(PyExc_SystemError,
847 "Negative size passed to _PyUnicode_New");
848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000849 }
850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200851 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
852 if (unicode == NULL)
853 return NULL;
854 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
855 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
856 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100857 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000858 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100859 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200861
Jeremy Hyltond8082792003-09-16 19:41:39 +0000862 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000863 * the caller fails before initializing str -- unicode_resize()
864 * reads str[0], and the Keep-Alive optimization can keep memory
865 * allocated for str alive across a call to unicode_dealloc(unicode).
866 * We don't want unicode_resize to read uninitialized memory in
867 * that case.
868 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200869 _PyUnicode_WSTR(unicode)[0] = 0;
870 _PyUnicode_WSTR(unicode)[length] = 0;
871 _PyUnicode_WSTR_LENGTH(unicode) = length;
872 _PyUnicode_HASH(unicode) = -1;
873 _PyUnicode_STATE(unicode).interned = 0;
874 _PyUnicode_STATE(unicode).kind = 0;
875 _PyUnicode_STATE(unicode).compact = 0;
876 _PyUnicode_STATE(unicode).ready = 0;
877 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200878 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200880 _PyUnicode_UTF8(unicode) = NULL;
881 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100882 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 return unicode;
884}
885
Victor Stinnerf42dc442011-10-02 23:33:16 +0200886static const char*
887unicode_kind_name(PyObject *unicode)
888{
Victor Stinner42dfd712011-10-03 14:41:45 +0200889 /* don't check consistency: unicode_kind_name() is called from
890 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200891 if (!PyUnicode_IS_COMPACT(unicode))
892 {
893 if (!PyUnicode_IS_READY(unicode))
894 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600895 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200896 {
897 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200898 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200899 return "legacy ascii";
900 else
901 return "legacy latin1";
902 case PyUnicode_2BYTE_KIND:
903 return "legacy UCS2";
904 case PyUnicode_4BYTE_KIND:
905 return "legacy UCS4";
906 default:
907 return "<legacy invalid kind>";
908 }
909 }
910 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600911 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200912 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200913 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200914 return "ascii";
915 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200916 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200917 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200918 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200919 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200920 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200921 default:
922 return "<invalid compact kind>";
923 }
924}
925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200927/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
931
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
935void *_PyUnicode_data(void *unicode){
936 printf("obj %p\n", unicode);
937 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
938 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
939 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
940 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
941 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
942 return PyUnicode_DATA(unicode);
943}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200944
945void
946_PyUnicode_Dump(PyObject *op)
947{
948 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
950 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
951 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200952
Victor Stinnera849a4b2011-10-03 12:12:11 +0200953 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200954 {
955 if (ascii->state.ascii)
956 data = (ascii + 1);
957 else
958 data = (compact + 1);
959 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200960 else
961 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200962 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
963
Victor Stinnera849a4b2011-10-03 12:12:11 +0200964 if (ascii->wstr == data)
965 printf("shared ");
966 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200967
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200969 printf(" (%zu), ", compact->wstr_length);
970 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
971 printf("shared ");
972 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200973 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200974 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200975}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#endif
977
978PyObject *
979PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
980{
981 PyObject *obj;
982 PyCompactUnicodeObject *unicode;
983 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +0200984 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200985 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200986 Py_ssize_t char_size;
987 Py_ssize_t struct_size;
988
989 /* Optimization for empty strings */
990 if (size == 0 && unicode_empty != NULL) {
991 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200992 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200993 }
994
Victor Stinner9e9d6892011-10-04 01:02:02 +0200995 is_ascii = 0;
996 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997 struct_size = sizeof(PyCompactUnicodeObject);
998 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +0200999 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001000 char_size = 1;
1001 is_ascii = 1;
1002 struct_size = sizeof(PyASCIIObject);
1003 }
1004 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001005 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001006 char_size = 1;
1007 }
1008 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001009 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001010 char_size = 2;
1011 if (sizeof(wchar_t) == 2)
1012 is_sharing = 1;
1013 }
1014 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001015 if (maxchar > MAX_UNICODE) {
1016 PyErr_SetString(PyExc_SystemError,
1017 "invalid maximum character passed to PyUnicode_New");
1018 return NULL;
1019 }
Victor Stinner8f825062012-04-27 13:55:39 +02001020 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001021 char_size = 4;
1022 if (sizeof(wchar_t) == 4)
1023 is_sharing = 1;
1024 }
1025
1026 /* Ensure we won't overflow the size. */
1027 if (size < 0) {
1028 PyErr_SetString(PyExc_SystemError,
1029 "Negative size passed to PyUnicode_New");
1030 return NULL;
1031 }
1032 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1033 return PyErr_NoMemory();
1034
1035 /* Duplicated allocation code from _PyObject_New() instead of a call to
1036 * PyObject_New() so we are able to allocate space for the object and
1037 * it's data buffer.
1038 */
1039 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1040 if (obj == NULL)
1041 return PyErr_NoMemory();
1042 obj = PyObject_INIT(obj, &PyUnicode_Type);
1043 if (obj == NULL)
1044 return NULL;
1045
1046 unicode = (PyCompactUnicodeObject *)obj;
1047 if (is_ascii)
1048 data = ((PyASCIIObject*)obj) + 1;
1049 else
1050 data = unicode + 1;
1051 _PyUnicode_LENGTH(unicode) = size;
1052 _PyUnicode_HASH(unicode) = -1;
1053 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001054 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055 _PyUnicode_STATE(unicode).compact = 1;
1056 _PyUnicode_STATE(unicode).ready = 1;
1057 _PyUnicode_STATE(unicode).ascii = is_ascii;
1058 if (is_ascii) {
1059 ((char*)data)[size] = 0;
1060 _PyUnicode_WSTR(unicode) = NULL;
1061 }
Victor Stinner8f825062012-04-27 13:55:39 +02001062 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 ((char*)data)[size] = 0;
1064 _PyUnicode_WSTR(unicode) = NULL;
1065 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001067 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069 else {
1070 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001071 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001072 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001074 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 ((Py_UCS4*)data)[size] = 0;
1076 if (is_sharing) {
1077 _PyUnicode_WSTR_LENGTH(unicode) = size;
1078 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1079 }
1080 else {
1081 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1082 _PyUnicode_WSTR(unicode) = NULL;
1083 }
1084 }
Victor Stinner8f825062012-04-27 13:55:39 +02001085#ifdef Py_DEBUG
1086 /* Fill the data with invalid characters to detect bugs earlier.
1087 _PyUnicode_CheckConsistency(str, 1) detects invalid characters,
1088 at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII
1089 and U+FFFFFFFF is an invalid character in Unicode 6.0. */
1090 memset(data, 0xff, size * kind);
1091#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001092 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 return obj;
1094}
1095
1096#if SIZEOF_WCHAR_T == 2
1097/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1098 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001099 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001100
1101 This function assumes that unicode can hold one more code point than wstr
1102 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001103static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001104unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001105 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001106{
1107 const wchar_t *iter;
1108 Py_UCS4 *ucs4_out;
1109
Victor Stinner910337b2011-10-03 03:20:16 +02001110 assert(unicode != NULL);
1111 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1113 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1114
1115 for (iter = begin; iter < end; ) {
1116 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1117 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001118 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1119 && (iter+1) < end
1120 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 {
Victor Stinner551ac952011-11-29 22:58:13 +01001122 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 iter += 2;
1124 }
1125 else {
1126 *ucs4_out++ = *iter;
1127 iter++;
1128 }
1129 }
1130 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1131 _PyUnicode_GET_LENGTH(unicode)));
1132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133}
1134#endif
1135
Victor Stinnercd9950f2011-10-02 00:34:53 +02001136static int
Victor Stinner488fa492011-12-12 00:01:39 +01001137unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001138{
Victor Stinner488fa492011-12-12 00:01:39 +01001139 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001140 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001141 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001142 return -1;
1143 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001144 return 0;
1145}
1146
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147static int
1148_copy_characters(PyObject *to, Py_ssize_t to_start,
1149 PyObject *from, Py_ssize_t from_start,
1150 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001152 unsigned int from_kind, to_kind;
1153 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001154
Victor Stinneree4544c2012-05-09 22:24:08 +02001155 assert(0 <= how_many);
1156 assert(0 <= from_start);
1157 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001158 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001159 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001160 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001161
Victor Stinnerd3f08822012-05-29 12:57:52 +02001162 assert(PyUnicode_Check(to));
1163 assert(PyUnicode_IS_READY(to));
1164 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1165
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001166 if (how_many == 0)
1167 return 0;
1168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001170 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001171 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001172 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001173
Victor Stinnerf1852262012-06-16 16:38:26 +02001174#ifdef Py_DEBUG
1175 if (!check_maxchar
1176 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1177 {
1178 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1179 Py_UCS4 ch;
1180 Py_ssize_t i;
1181 for (i=0; i < how_many; i++) {
1182 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1183 assert(ch <= to_maxchar);
1184 }
1185 }
1186#endif
1187
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001188 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001189 if (check_maxchar
1190 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1191 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001192 /* Writing Latin-1 characters into an ASCII string requires to
1193 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001194 Py_UCS4 max_char;
1195 max_char = ucs1lib_find_max_char(from_data,
1196 (Py_UCS1*)from_data + how_many);
1197 if (max_char >= 128)
1198 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001199 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001200 Py_MEMCPY((char*)to_data + to_kind * to_start,
1201 (char*)from_data + from_kind * from_start,
1202 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 else if (from_kind == PyUnicode_1BYTE_KIND
1205 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001206 {
1207 _PyUnicode_CONVERT_BYTES(
1208 Py_UCS1, Py_UCS2,
1209 PyUnicode_1BYTE_DATA(from) + from_start,
1210 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1211 PyUnicode_2BYTE_DATA(to) + to_start
1212 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001213 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001214 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001215 && to_kind == PyUnicode_4BYTE_KIND)
1216 {
1217 _PyUnicode_CONVERT_BYTES(
1218 Py_UCS1, Py_UCS4,
1219 PyUnicode_1BYTE_DATA(from) + from_start,
1220 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1221 PyUnicode_4BYTE_DATA(to) + to_start
1222 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001223 }
1224 else if (from_kind == PyUnicode_2BYTE_KIND
1225 && to_kind == PyUnicode_4BYTE_KIND)
1226 {
1227 _PyUnicode_CONVERT_BYTES(
1228 Py_UCS2, Py_UCS4,
1229 PyUnicode_2BYTE_DATA(from) + from_start,
1230 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1231 PyUnicode_4BYTE_DATA(to) + to_start
1232 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001233 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001234 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001235 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1236
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001237 if (!check_maxchar) {
1238 if (from_kind == PyUnicode_2BYTE_KIND
1239 && to_kind == PyUnicode_1BYTE_KIND)
1240 {
1241 _PyUnicode_CONVERT_BYTES(
1242 Py_UCS2, Py_UCS1,
1243 PyUnicode_2BYTE_DATA(from) + from_start,
1244 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1245 PyUnicode_1BYTE_DATA(to) + to_start
1246 );
1247 }
1248 else if (from_kind == PyUnicode_4BYTE_KIND
1249 && to_kind == PyUnicode_1BYTE_KIND)
1250 {
1251 _PyUnicode_CONVERT_BYTES(
1252 Py_UCS4, Py_UCS1,
1253 PyUnicode_4BYTE_DATA(from) + from_start,
1254 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1255 PyUnicode_1BYTE_DATA(to) + to_start
1256 );
1257 }
1258 else if (from_kind == PyUnicode_4BYTE_KIND
1259 && to_kind == PyUnicode_2BYTE_KIND)
1260 {
1261 _PyUnicode_CONVERT_BYTES(
1262 Py_UCS4, Py_UCS2,
1263 PyUnicode_4BYTE_DATA(from) + from_start,
1264 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1265 PyUnicode_2BYTE_DATA(to) + to_start
1266 );
1267 }
1268 else {
1269 assert(0);
1270 return -1;
1271 }
1272 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001273 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001274 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001275 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001276 Py_ssize_t i;
1277
Victor Stinnera0702ab2011-09-29 14:14:38 +02001278 for (i=0; i < how_many; i++) {
1279 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001280 if (ch > to_maxchar)
1281 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 }
1285 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001286 return 0;
1287}
1288
Victor Stinnerd3f08822012-05-29 12:57:52 +02001289void
1290_PyUnicode_FastCopyCharacters(
1291 PyObject *to, Py_ssize_t to_start,
1292 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001293{
1294 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1295}
1296
1297Py_ssize_t
1298PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1299 PyObject *from, Py_ssize_t from_start,
1300 Py_ssize_t how_many)
1301{
1302 int err;
1303
1304 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1305 PyErr_BadInternalCall();
1306 return -1;
1307 }
1308
Benjamin Petersonbac79492012-01-14 13:34:47 -05001309 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001310 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001311 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001312 return -1;
1313
Victor Stinnerd3f08822012-05-29 12:57:52 +02001314 if (from_start < 0) {
1315 PyErr_SetString(PyExc_IndexError, "string index out of range");
1316 return -1;
1317 }
1318 if (to_start < 0) {
1319 PyErr_SetString(PyExc_IndexError, "string index out of range");
1320 return -1;
1321 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001322 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1323 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1324 PyErr_Format(PyExc_SystemError,
1325 "Cannot write %zi characters at %zi "
1326 "in a string of %zi characters",
1327 how_many, to_start, PyUnicode_GET_LENGTH(to));
1328 return -1;
1329 }
1330
1331 if (how_many == 0)
1332 return 0;
1333
Victor Stinner488fa492011-12-12 00:01:39 +01001334 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001335 return -1;
1336
1337 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1338 if (err) {
1339 PyErr_Format(PyExc_SystemError,
1340 "Cannot copy %s characters "
1341 "into a string of %s characters",
1342 unicode_kind_name(from),
1343 unicode_kind_name(to));
1344 return -1;
1345 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001346 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347}
1348
Victor Stinner17222162011-09-28 22:15:37 +02001349/* Find the maximum code point and count the number of surrogate pairs so a
1350 correct string length can be computed before converting a string to UCS4.
1351 This function counts single surrogates as a character and not as a pair.
1352
1353 Return 0 on success, or -1 on error. */
1354static int
1355find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1356 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357{
1358 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001359 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360
Victor Stinnerc53be962011-10-02 21:33:54 +02001361 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001362 *num_surrogates = 0;
1363 *maxchar = 0;
1364
1365 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001366#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001367 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1368 && (iter+1) < end
1369 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001370 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001371 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001373 iter += 2;
1374 }
1375 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001377 {
1378 ch = *iter;
1379 iter++;
1380 }
1381 if (ch > *maxchar) {
1382 *maxchar = ch;
1383 if (*maxchar > MAX_UNICODE) {
1384 PyErr_Format(PyExc_ValueError,
1385 "character U+%x is not in range [U+0000; U+10ffff]",
1386 ch);
1387 return -1;
1388 }
1389 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 }
1391 return 0;
1392}
1393
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001394int
1395_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396{
1397 wchar_t *end;
1398 Py_UCS4 maxchar = 0;
1399 Py_ssize_t num_surrogates;
1400#if SIZEOF_WCHAR_T == 2
1401 Py_ssize_t length_wo_surrogates;
1402#endif
1403
Georg Brandl7597add2011-10-05 16:36:47 +02001404 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001405 strings were created using _PyObject_New() and where no canonical
1406 representation (the str field) has been set yet aka strings
1407 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001408 assert(_PyUnicode_CHECK(unicode));
1409 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001411 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001412 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001413 /* Actually, it should neither be interned nor be anything else: */
1414 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001417 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001418 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420
1421 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001422 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1423 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 PyErr_NoMemory();
1425 return -1;
1426 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001427 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428 _PyUnicode_WSTR(unicode), end,
1429 PyUnicode_1BYTE_DATA(unicode));
1430 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1431 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1432 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1433 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001434 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001435 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001436 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 }
1438 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001439 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001440 _PyUnicode_UTF8(unicode) = NULL;
1441 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 }
1443 PyObject_FREE(_PyUnicode_WSTR(unicode));
1444 _PyUnicode_WSTR(unicode) = NULL;
1445 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1446 }
1447 /* In this case we might have to convert down from 4-byte native
1448 wchar_t to 2-byte unicode. */
1449 else if (maxchar < 65536) {
1450 assert(num_surrogates == 0 &&
1451 "FindMaxCharAndNumSurrogatePairs() messed up");
1452
Victor Stinner506f5922011-09-28 22:34:18 +02001453#if SIZEOF_WCHAR_T == 2
1454 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001455 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001456 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1457 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1458 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001459 _PyUnicode_UTF8(unicode) = NULL;
1460 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001461#else
1462 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001463 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001464 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001465 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001466 PyErr_NoMemory();
1467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 }
Victor Stinner506f5922011-09-28 22:34:18 +02001469 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1470 _PyUnicode_WSTR(unicode), end,
1471 PyUnicode_2BYTE_DATA(unicode));
1472 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1473 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1474 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001475 _PyUnicode_UTF8(unicode) = NULL;
1476 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001477 PyObject_FREE(_PyUnicode_WSTR(unicode));
1478 _PyUnicode_WSTR(unicode) = NULL;
1479 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1480#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 }
1482 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1483 else {
1484#if SIZEOF_WCHAR_T == 2
1485 /* in case the native representation is 2-bytes, we need to allocate a
1486 new normalized 4-byte version. */
1487 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001488 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1489 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 PyErr_NoMemory();
1491 return -1;
1492 }
1493 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1494 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001495 _PyUnicode_UTF8(unicode) = NULL;
1496 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001497 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1498 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001499 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001500 PyObject_FREE(_PyUnicode_WSTR(unicode));
1501 _PyUnicode_WSTR(unicode) = NULL;
1502 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1503#else
1504 assert(num_surrogates == 0);
1505
Victor Stinnerc3c74152011-10-02 20:39:55 +02001506 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001508 _PyUnicode_UTF8(unicode) = NULL;
1509 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1511#endif
1512 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1513 }
1514 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001515 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 return 0;
1517}
1518
Alexander Belopolsky40018472011-02-26 01:02:56 +00001519static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001520unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521{
Walter Dörwald16807132007-05-25 13:52:07 +00001522 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001523 case SSTATE_NOT_INTERNED:
1524 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001525
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 case SSTATE_INTERNED_MORTAL:
1527 /* revive dead object temporarily for DelItem */
1528 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001529 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001530 Py_FatalError(
1531 "deletion of interned string failed");
1532 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001533
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 case SSTATE_INTERNED_IMMORTAL:
1535 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001536
Benjamin Peterson29060642009-01-31 22:14:21 +00001537 default:
1538 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001539 }
1540
Victor Stinner03490912011-10-03 23:45:12 +02001541 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001543 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001544 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001545 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1546 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001548 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549}
1550
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a cached one-character Latin-1 string), else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length 1 with a canonical kind can be a cached
       Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1567
Alexander Belopolsky40018472011-02-26 01:02:56 +00001568static int
Victor Stinner488fa492011-12-12 00:01:39 +01001569unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001570{
Victor Stinner488fa492011-12-12 00:01:39 +01001571 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001572 if (Py_REFCNT(unicode) != 1)
1573 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001574 if (_PyUnicode_HASH(unicode) != -1)
1575 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001576 if (PyUnicode_CHECK_INTERNED(unicode))
1577 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001578 if (!PyUnicode_CheckExact(unicode))
1579 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001580#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001581 /* singleton refcount is greater than 1 */
1582 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001583#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001584 return 1;
1585}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586
Victor Stinnerfe226c02011-10-03 03:52:20 +02001587static int
1588unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1589{
1590 PyObject *unicode;
1591 Py_ssize_t old_length;
1592
1593 assert(p_unicode != NULL);
1594 unicode = *p_unicode;
1595
1596 assert(unicode != NULL);
1597 assert(PyUnicode_Check(unicode));
1598 assert(0 <= length);
1599
Victor Stinner910337b2011-10-03 03:20:16 +02001600 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001601 old_length = PyUnicode_WSTR_LENGTH(unicode);
1602 else
1603 old_length = PyUnicode_GET_LENGTH(unicode);
1604 if (old_length == length)
1605 return 0;
1606
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001607 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001608 _Py_INCREF_UNICODE_EMPTY();
1609 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001610 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001611 Py_DECREF(*p_unicode);
1612 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001613 return 0;
1614 }
1615
Victor Stinner488fa492011-12-12 00:01:39 +01001616 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001617 PyObject *copy = resize_copy(unicode, length);
1618 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001619 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001620 Py_DECREF(*p_unicode);
1621 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001622 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001623 }
1624
Victor Stinnerfe226c02011-10-03 03:52:20 +02001625 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001626 PyObject *new_unicode = resize_compact(unicode, length);
1627 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001629 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001630 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001631 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001632 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001633}
1634
Alexander Belopolsky40018472011-02-26 01:02:56 +00001635int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001637{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001638 PyObject *unicode;
1639 if (p_unicode == NULL) {
1640 PyErr_BadInternalCall();
1641 return -1;
1642 }
1643 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001644 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001645 {
1646 PyErr_BadInternalCall();
1647 return -1;
1648 }
1649 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001650}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001651
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001652static int
Victor Stinner1b487b42012-05-03 12:29:04 +02001653unicode_widen(PyObject **p_unicode, Py_ssize_t length,
1654 unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001655{
1656 PyObject *result;
1657 assert(PyUnicode_IS_READY(*p_unicode));
Victor Stinner1b487b42012-05-03 12:29:04 +02001658 assert(length <= PyUnicode_GET_LENGTH(*p_unicode));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1660 return 0;
1661 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1662 maxchar);
1663 if (result == NULL)
1664 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +02001665 _PyUnicode_FastCopyCharacters(result, 0, *p_unicode, 0, length);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001666 Py_DECREF(*p_unicode);
1667 *p_unicode = result;
1668 return 0;
1669}
1670
1671static int
1672unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1673 Py_UCS4 ch)
1674{
Victor Stinner15e9ed22012-02-22 13:36:20 +01001675 assert(ch <= MAX_UNICODE);
Victor Stinner1b487b42012-05-03 12:29:04 +02001676 if (unicode_widen(p_unicode, *pos, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001677 return -1;
1678 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1679 PyUnicode_DATA(*p_unicode),
1680 (*pos)++, ch);
1681 return 0;
1682}
1683
Victor Stinnerc5166102012-02-22 13:55:02 +01001684/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001685
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001686 WARNING: The function doesn't copy the terminating null character and
1687 doesn't check the maximum character (may write a latin1 character in an
1688 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001689static void
1690unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1691 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001692{
1693 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1694 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001695 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001696
1697 switch (kind) {
1698 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001699 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001700 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001701 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001702 }
1703 case PyUnicode_2BYTE_KIND: {
1704 Py_UCS2 *start = (Py_UCS2 *)data + index;
1705 Py_UCS2 *ucs2 = start;
1706 assert(index <= PyUnicode_GET_LENGTH(unicode));
1707
Victor Stinner184252a2012-06-16 02:57:41 +02001708 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001709 *ucs2 = (Py_UCS2)*str;
1710
1711 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001712 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001713 }
1714 default: {
1715 Py_UCS4 *start = (Py_UCS4 *)data + index;
1716 Py_UCS4 *ucs4 = start;
1717 assert(kind == PyUnicode_4BYTE_KIND);
1718 assert(index <= PyUnicode_GET_LENGTH(unicode));
1719
Victor Stinner184252a2012-06-16 02:57:41 +02001720 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001721 *ucs4 = (Py_UCS4)*str;
1722
1723 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001724 }
1725 }
1726}
1727
1728
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729static PyObject*
1730get_latin1_char(unsigned char ch)
1731{
Victor Stinnera464fc12011-10-02 20:39:30 +02001732 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001733 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001734 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 if (!unicode)
1736 return NULL;
1737 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001738 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739 unicode_latin1[ch] = unicode;
1740 }
1741 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001742 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743}
1744
Alexander Belopolsky40018472011-02-26 01:02:56 +00001745PyObject *
1746PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001748 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 Py_UCS4 maxchar = 0;
1750 Py_ssize_t num_surrogates;
1751
1752 if (u == NULL)
1753 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001755 /* If the Unicode data is known at construction time, we can apply
1756 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001759 if (size == 0)
1760 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 /* Single character Unicode objects in the Latin-1 range are
1763 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001764 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 return get_latin1_char((unsigned char)*u);
1766
1767 /* If not empty and not single character, copy the Unicode data
1768 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001769 if (find_maxchar_surrogates(u, u + size,
1770 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 return NULL;
1772
Victor Stinner8faf8212011-12-08 22:14:11 +01001773 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 if (!unicode)
1775 return NULL;
1776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 switch (PyUnicode_KIND(unicode)) {
1778 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001779 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1781 break;
1782 case PyUnicode_2BYTE_KIND:
1783#if Py_UNICODE_SIZE == 2
1784 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1785#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001786 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1788#endif
1789 break;
1790 case PyUnicode_4BYTE_KIND:
1791#if SIZEOF_WCHAR_T == 2
1792 /* This is the only case which has to process surrogates, thus
1793 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001794 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001795#else
1796 assert(num_surrogates == 0);
1797 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1798#endif
1799 break;
1800 default:
1801 assert(0 && "Impossible state");
1802 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001804 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805}
1806
Alexander Belopolsky40018472011-02-26 01:02:56 +00001807PyObject *
1808PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001809{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001810 if (size < 0) {
1811 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001812 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001813 return NULL;
1814 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001815 if (u != NULL)
1816 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1817 else
1818 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001819}
1820
Alexander Belopolsky40018472011-02-26 01:02:56 +00001821PyObject *
1822PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001823{
1824 size_t size = strlen(u);
1825 if (size > PY_SSIZE_T_MAX) {
1826 PyErr_SetString(PyExc_OverflowError, "input too long");
1827 return NULL;
1828 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001829 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001830}
1831
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001832PyObject *
1833_PyUnicode_FromId(_Py_Identifier *id)
1834{
1835 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001836 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1837 strlen(id->string),
1838 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001839 if (!id->object)
1840 return NULL;
1841 PyUnicode_InternInPlace(&id->object);
1842 assert(!id->next);
1843 id->next = static_strings;
1844 static_strings = id;
1845 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001846 return id->object;
1847}
1848
1849void
1850_PyUnicode_ClearStaticStrings()
1851{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001852 _Py_Identifier *tmp, *s = static_strings;
1853 while (s) {
1854 Py_DECREF(s->object);
1855 s->object = NULL;
1856 tmp = s->next;
1857 s->next = NULL;
1858 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001859 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001860 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001861}
1862
Benjamin Peterson0df54292012-03-26 14:50:32 -04001863/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001864
Victor Stinnerd3f08822012-05-29 12:57:52 +02001865PyObject*
1866_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001867{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001868 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001869 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001870 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001871#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001872 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001873#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001874 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001875 }
Victor Stinner785938e2011-12-11 20:09:03 +01001876 unicode = PyUnicode_New(size, 127);
1877 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001878 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001879 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1880 assert(_PyUnicode_CheckConsistency(unicode, 1));
1881 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001882}
1883
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001884static Py_UCS4
1885kind_maxchar_limit(unsigned int kind)
1886{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001887 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001888 case PyUnicode_1BYTE_KIND:
1889 return 0x80;
1890 case PyUnicode_2BYTE_KIND:
1891 return 0x100;
1892 case PyUnicode_4BYTE_KIND:
1893 return 0x10000;
1894 default:
1895 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001896 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001897 }
1898}
1899
Victor Stinnere6abb482012-05-02 01:15:40 +02001900Py_LOCAL_INLINE(Py_UCS4)
1901align_maxchar(Py_UCS4 maxchar)
1902{
1903 if (maxchar <= 127)
1904 return 127;
1905 else if (maxchar <= 255)
1906 return 255;
1907 else if (maxchar <= 65535)
1908 return 65535;
1909 else
1910 return MAX_UNICODE;
1911}
1912
Victor Stinner702c7342011-10-05 13:50:52 +02001913static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001914_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001917 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001918
Serhiy Storchaka678db842013-01-26 12:16:36 +02001919 if (size == 0)
1920 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001921 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001922 if (size == 1)
1923 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001924
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001925 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001926 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 if (!res)
1928 return NULL;
1929 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001930 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001932}
1933
Victor Stinnere57b1c02011-09-28 22:20:48 +02001934static PyObject*
1935_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001936{
1937 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001938 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001939
Serhiy Storchaka678db842013-01-26 12:16:36 +02001940 if (size == 0)
1941 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001942 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001943 if (size == 1) {
1944 Py_UCS4 ch = u[0];
1945 if (ch < 256)
1946 return get_latin1_char((unsigned char)ch);
1947
1948 res = PyUnicode_New(1, ch);
1949 if (res == NULL)
1950 return NULL;
1951 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1952 assert(_PyUnicode_CheckConsistency(res, 1));
1953 return res;
1954 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001955
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001957 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 if (!res)
1959 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001960 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001961 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001962 else {
1963 _PyUnicode_CONVERT_BYTES(
1964 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1965 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001966 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 return res;
1968}
1969
Victor Stinnere57b1c02011-09-28 22:20:48 +02001970static PyObject*
1971_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001972{
1973 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001975
Serhiy Storchaka678db842013-01-26 12:16:36 +02001976 if (size == 0)
1977 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978 assert(size > 0);
Victor Stinnerb6cd0142012-05-03 02:17:04 +02001979 if (size == 1) {
1980 Py_UCS4 ch = u[0];
1981 if (ch < 256)
1982 return get_latin1_char((unsigned char)ch);
1983
1984 res = PyUnicode_New(1, ch);
1985 if (res == NULL)
1986 return NULL;
1987 PyUnicode_WRITE(PyUnicode_KIND(res), PyUnicode_DATA(res), 0, ch);
1988 assert(_PyUnicode_CheckConsistency(res, 1));
1989 return res;
1990 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001991
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001992 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001993 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994 if (!res)
1995 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001996 if (max_char < 256)
1997 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1998 PyUnicode_1BYTE_DATA(res));
1999 else if (max_char < 0x10000)
2000 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2001 PyUnicode_2BYTE_DATA(res));
2002 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002004 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 return res;
2006}
2007
2008PyObject*
2009PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2010{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002011 if (size < 0) {
2012 PyErr_SetString(PyExc_ValueError, "size must be positive");
2013 return NULL;
2014 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002015 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002017 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002018 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002019 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002020 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002021 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002022 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002023 PyErr_SetString(PyExc_SystemError, "invalid kind");
2024 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026}
2027
Victor Stinnerece58de2012-04-23 23:36:38 +02002028Py_UCS4
2029_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2030{
2031 enum PyUnicode_Kind kind;
2032 void *startptr, *endptr;
2033
2034 assert(PyUnicode_IS_READY(unicode));
2035 assert(0 <= start);
2036 assert(end <= PyUnicode_GET_LENGTH(unicode));
2037 assert(start <= end);
2038
2039 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2040 return PyUnicode_MAX_CHAR_VALUE(unicode);
2041
2042 if (start == end)
2043 return 127;
2044
Victor Stinner94d558b2012-04-27 22:26:58 +02002045 if (PyUnicode_IS_ASCII(unicode))
2046 return 127;
2047
Victor Stinnerece58de2012-04-23 23:36:38 +02002048 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002049 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002050 endptr = (char *)startptr + end * kind;
2051 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002052 switch(kind) {
2053 case PyUnicode_1BYTE_KIND:
2054 return ucs1lib_find_max_char(startptr, endptr);
2055 case PyUnicode_2BYTE_KIND:
2056 return ucs2lib_find_max_char(startptr, endptr);
2057 case PyUnicode_4BYTE_KIND:
2058 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002059 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002060 assert(0);
2061 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002062 }
2063}
2064
Victor Stinner25a4b292011-10-06 12:31:55 +02002065/* Ensure that a string uses the most efficient storage, if it is not the
2066 case: create a new string with of the right kind. Write NULL into *p_unicode
2067 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002068static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002069unicode_adjust_maxchar(PyObject **p_unicode)
2070{
2071 PyObject *unicode, *copy;
2072 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002073 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002074 unsigned int kind;
2075
2076 assert(p_unicode != NULL);
2077 unicode = *p_unicode;
2078 assert(PyUnicode_IS_READY(unicode));
2079 if (PyUnicode_IS_ASCII(unicode))
2080 return;
2081
2082 len = PyUnicode_GET_LENGTH(unicode);
2083 kind = PyUnicode_KIND(unicode);
2084 if (kind == PyUnicode_1BYTE_KIND) {
2085 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002086 max_char = ucs1lib_find_max_char(u, u + len);
2087 if (max_char >= 128)
2088 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002089 }
2090 else if (kind == PyUnicode_2BYTE_KIND) {
2091 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002092 max_char = ucs2lib_find_max_char(u, u + len);
2093 if (max_char >= 256)
2094 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002095 }
2096 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002097 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002098 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002099 max_char = ucs4lib_find_max_char(u, u + len);
2100 if (max_char >= 0x10000)
2101 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002102 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002103 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002104 if (copy != NULL)
2105 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 Py_DECREF(unicode);
2107 *p_unicode = copy;
2108}
2109
Victor Stinner034f6cf2011-09-30 02:26:44 +02002110PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002111_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002112{
Victor Stinner87af4f22011-11-21 23:03:47 +01002113 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002114 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002115
Victor Stinner034f6cf2011-09-30 02:26:44 +02002116 if (!PyUnicode_Check(unicode)) {
2117 PyErr_BadInternalCall();
2118 return NULL;
2119 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002120 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002121 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002122
Victor Stinner87af4f22011-11-21 23:03:47 +01002123 length = PyUnicode_GET_LENGTH(unicode);
2124 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002125 if (!copy)
2126 return NULL;
2127 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2128
Victor Stinner87af4f22011-11-21 23:03:47 +01002129 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2130 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002131 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002132 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002133}
2134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135
Victor Stinnerbc603d12011-10-02 01:00:40 +02002136/* Widen Unicode objects to larger buffers. Don't write terminating null
2137 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138
2139void*
2140_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2141{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002142 Py_ssize_t len;
2143 void *result;
2144 unsigned int skind;
2145
Benjamin Petersonbac79492012-01-14 13:34:47 -05002146 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002147 return NULL;
2148
2149 len = PyUnicode_GET_LENGTH(s);
2150 skind = PyUnicode_KIND(s);
2151 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002152 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002153 return NULL;
2154 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002155 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002156 case PyUnicode_2BYTE_KIND:
2157 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2158 if (!result)
2159 return PyErr_NoMemory();
2160 assert(skind == PyUnicode_1BYTE_KIND);
2161 _PyUnicode_CONVERT_BYTES(
2162 Py_UCS1, Py_UCS2,
2163 PyUnicode_1BYTE_DATA(s),
2164 PyUnicode_1BYTE_DATA(s) + len,
2165 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002167 case PyUnicode_4BYTE_KIND:
2168 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2169 if (!result)
2170 return PyErr_NoMemory();
2171 if (skind == PyUnicode_2BYTE_KIND) {
2172 _PyUnicode_CONVERT_BYTES(
2173 Py_UCS2, Py_UCS4,
2174 PyUnicode_2BYTE_DATA(s),
2175 PyUnicode_2BYTE_DATA(s) + len,
2176 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002178 else {
2179 assert(skind == PyUnicode_1BYTE_KIND);
2180 _PyUnicode_CONVERT_BYTES(
2181 Py_UCS1, Py_UCS4,
2182 PyUnicode_1BYTE_DATA(s),
2183 PyUnicode_1BYTE_DATA(s) + len,
2184 result);
2185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002187 default:
2188 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002189 }
Victor Stinner01698042011-10-04 00:04:26 +02002190 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002191 return NULL;
2192}
2193
2194static Py_UCS4*
2195as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2196 int copy_null)
2197{
2198 int kind;
2199 void *data;
2200 Py_ssize_t len, targetlen;
2201 if (PyUnicode_READY(string) == -1)
2202 return NULL;
2203 kind = PyUnicode_KIND(string);
2204 data = PyUnicode_DATA(string);
2205 len = PyUnicode_GET_LENGTH(string);
2206 targetlen = len;
2207 if (copy_null)
2208 targetlen++;
2209 if (!target) {
2210 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2211 PyErr_NoMemory();
2212 return NULL;
2213 }
2214 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2215 if (!target) {
2216 PyErr_NoMemory();
2217 return NULL;
2218 }
2219 }
2220 else {
2221 if (targetsize < targetlen) {
2222 PyErr_Format(PyExc_SystemError,
2223 "string is longer than the buffer");
2224 if (copy_null && 0 < targetsize)
2225 target[0] = 0;
2226 return NULL;
2227 }
2228 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002229 if (kind == PyUnicode_1BYTE_KIND) {
2230 Py_UCS1 *start = (Py_UCS1 *) data;
2231 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002232 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002233 else if (kind == PyUnicode_2BYTE_KIND) {
2234 Py_UCS2 *start = (Py_UCS2 *) data;
2235 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2236 }
2237 else {
2238 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 if (copy_null)
2242 target[len] = 0;
2243 return target;
2244}
2245
2246Py_UCS4*
2247PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2248 int copy_null)
2249{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002250 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002251 PyErr_BadInternalCall();
2252 return NULL;
2253 }
2254 return as_ucs4(string, target, targetsize, copy_null);
2255}
2256
2257Py_UCS4*
2258PyUnicode_AsUCS4Copy(PyObject *string)
2259{
2260 return as_ucs4(string, NULL, 0, 1);
2261}
2262
2263#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002264
Alexander Belopolsky40018472011-02-26 01:02:56 +00002265PyObject *
2266PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002268 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002269 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002270 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002271 PyErr_BadInternalCall();
2272 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 }
2274
Martin v. Löwis790465f2008-04-05 20:41:37 +00002275 if (size == -1) {
2276 size = wcslen(w);
2277 }
2278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280}
2281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002283
Walter Dörwald346737f2007-05-31 10:44:43 +00002284static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002285makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2286 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002287{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002288 *fmt++ = '%';
2289 if (width) {
2290 if (zeropad)
2291 *fmt++ = '0';
2292 fmt += sprintf(fmt, "%d", width);
2293 }
2294 if (precision)
2295 fmt += sprintf(fmt, ".%d", precision);
2296 if (longflag)
2297 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002298 else if (longlongflag) {
2299 /* longlongflag should only ever be nonzero on machines with
2300 HAVE_LONG_LONG defined */
2301#ifdef HAVE_LONG_LONG
2302 char *f = PY_FORMAT_LONG_LONG;
2303 while (*f)
2304 *fmt++ = *f++;
2305#else
2306 /* we shouldn't ever get here */
2307 assert(0);
2308 *fmt++ = 'l';
2309#endif
2310 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002311 else if (size_tflag) {
2312 char *f = PY_FORMAT_SIZE_T;
2313 while (*f)
2314 *fmt++ = *f++;
2315 }
2316 *fmt++ = c;
2317 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002318}
2319
Victor Stinner96865452011-03-01 23:44:09 +00002320/* helper for PyUnicode_FromFormatV() */
2321
2322static const char*
2323parse_format_flags(const char *f,
2324 int *p_width, int *p_precision,
2325 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2326{
2327 int width, precision, longflag, longlongflag, size_tflag;
2328
2329 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2330 f++;
2331 width = 0;
2332 while (Py_ISDIGIT((unsigned)*f))
2333 width = (width*10) + *f++ - '0';
2334 precision = 0;
2335 if (*f == '.') {
2336 f++;
2337 while (Py_ISDIGIT((unsigned)*f))
2338 precision = (precision*10) + *f++ - '0';
2339 if (*f == '%') {
2340 /* "%.3%s" => f points to "3" */
2341 f--;
2342 }
2343 }
2344 if (*f == '\0') {
2345 /* bogus format "%.1" => go backward, f points to "1" */
2346 f--;
2347 }
2348 if (p_width != NULL)
2349 *p_width = width;
2350 if (p_precision != NULL)
2351 *p_precision = precision;
2352
2353 /* Handle %ld, %lu, %lld and %llu. */
2354 longflag = 0;
2355 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002356 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002357
2358 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002359 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002360 longflag = 1;
2361 ++f;
2362 }
2363#ifdef HAVE_LONG_LONG
2364 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002365 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002366 longlongflag = 1;
2367 f += 2;
2368 }
2369#endif
2370 }
2371 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002372 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002373 size_tflag = 1;
2374 ++f;
2375 }
2376 if (p_longflag != NULL)
2377 *p_longflag = longflag;
2378 if (p_longlongflag != NULL)
2379 *p_longlongflag = longlongflag;
2380 if (p_size_tflag != NULL)
2381 *p_size_tflag = size_tflag;
2382 return f;
2383}
2384
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   The expression below is 1 + ceil(SIZEOF_LONG_LONG*53/22) spelled with
   integer arithmetic: for positive x, (x-1)/22 + 1 rounds x/22 up. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2392
Walter Dörwaldd2034312007-05-18 16:29:38 +00002393PyObject *
2394PyUnicode_FromFormatV(const char *format, va_list vargs)
2395{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002396 va_list count;
2397 Py_ssize_t callcount = 0;
2398 PyObject **callresults = NULL;
2399 PyObject **callresult = NULL;
2400 Py_ssize_t n = 0;
2401 int width = 0;
2402 int precision = 0;
2403 int zeropad;
2404 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002405 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002406 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002407 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2409 Py_UCS4 argmaxchar;
2410 Py_ssize_t numbersize = 0;
2411 char *numberresults = NULL;
2412 char *numberresult = NULL;
2413 Py_ssize_t i;
2414 int kind;
2415 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002416
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002417 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002418 /* step 1: count the number of %S/%R/%A/%s format specifications
2419 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2420 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002422 * also estimate a upper bound for all the number formats in the string,
2423 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002424 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002425 for (f = format; *f; f++) {
2426 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002427 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2429 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2430 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2431 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002434#ifdef HAVE_LONG_LONG
2435 if (longlongflag) {
2436 if (width < MAX_LONG_LONG_CHARS)
2437 width = MAX_LONG_LONG_CHARS;
2438 }
2439 else
2440#endif
2441 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2442 including sign. Decimal takes the most space. This
2443 isn't enough for octal. If a width is specified we
2444 need more (which we allocate later). */
2445 if (width < MAX_LONG_CHARS)
2446 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447
2448 /* account for the size + '\0' to separate numbers
2449 inside of the numberresults buffer */
2450 numbersize += (width + 1);
2451 }
2452 }
2453 else if ((unsigned char)*f > 127) {
2454 PyErr_Format(PyExc_ValueError,
2455 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2456 "string, got a non-ASCII byte: 0x%02x",
2457 (unsigned char)*f);
2458 return NULL;
2459 }
2460 }
2461 /* step 2: allocate memory for the results of
2462 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2463 if (callcount) {
2464 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2465 if (!callresults) {
2466 PyErr_NoMemory();
2467 return NULL;
2468 }
2469 callresult = callresults;
2470 }
2471 /* step 2.5: allocate memory for the results of formating numbers */
2472 if (numbersize) {
2473 numberresults = PyObject_Malloc(numbersize);
2474 if (!numberresults) {
2475 PyErr_NoMemory();
2476 goto fail;
2477 }
2478 numberresult = numberresults;
2479 }
2480
2481 /* step 3: format numbers and figure out how large a buffer we need */
2482 for (f = format; *f; f++) {
2483 if (*f == '%') {
2484 const char* p;
2485 int longflag;
2486 int longlongflag;
2487 int size_tflag;
2488 int numprinted;
2489
2490 p = f;
2491 zeropad = (f[1] == '0');
2492 f = parse_format_flags(f, &width, &precision,
2493 &longflag, &longlongflag, &size_tflag);
2494 switch (*f) {
2495 case 'c':
2496 {
2497 Py_UCS4 ordinal = va_arg(count, int);
Victor Stinnere6abb482012-05-02 01:15:40 +02002498 maxchar = MAX_MAXCHAR(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 n++;
2500 break;
2501 }
2502 case '%':
2503 n++;
2504 break;
2505 case 'i':
2506 case 'd':
2507 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2508 width, precision, *f);
2509 if (longflag)
2510 numprinted = sprintf(numberresult, fmt,
2511 va_arg(count, long));
2512#ifdef HAVE_LONG_LONG
2513 else if (longlongflag)
2514 numprinted = sprintf(numberresult, fmt,
2515 va_arg(count, PY_LONG_LONG));
2516#endif
2517 else if (size_tflag)
2518 numprinted = sprintf(numberresult, fmt,
2519 va_arg(count, Py_ssize_t));
2520 else
2521 numprinted = sprintf(numberresult, fmt,
2522 va_arg(count, int));
2523 n += numprinted;
2524 /* advance by +1 to skip over the '\0' */
2525 numberresult += (numprinted + 1);
2526 assert(*(numberresult - 1) == '\0');
2527 assert(*(numberresult - 2) != '\0');
2528 assert(numprinted >= 0);
2529 assert(numberresult <= numberresults + numbersize);
2530 break;
2531 case 'u':
2532 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2533 width, precision, 'u');
2534 if (longflag)
2535 numprinted = sprintf(numberresult, fmt,
2536 va_arg(count, unsigned long));
2537#ifdef HAVE_LONG_LONG
2538 else if (longlongflag)
2539 numprinted = sprintf(numberresult, fmt,
2540 va_arg(count, unsigned PY_LONG_LONG));
2541#endif
2542 else if (size_tflag)
2543 numprinted = sprintf(numberresult, fmt,
2544 va_arg(count, size_t));
2545 else
2546 numprinted = sprintf(numberresult, fmt,
2547 va_arg(count, unsigned int));
2548 n += numprinted;
2549 numberresult += (numprinted + 1);
2550 assert(*(numberresult - 1) == '\0');
2551 assert(*(numberresult - 2) != '\0');
2552 assert(numprinted >= 0);
2553 assert(numberresult <= numberresults + numbersize);
2554 break;
2555 case 'x':
2556 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2557 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2558 n += numprinted;
2559 numberresult += (numprinted + 1);
2560 assert(*(numberresult - 1) == '\0');
2561 assert(*(numberresult - 2) != '\0');
2562 assert(numprinted >= 0);
2563 assert(numberresult <= numberresults + numbersize);
2564 break;
2565 case 'p':
2566 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2567 /* %p is ill-defined: ensure leading 0x. */
2568 if (numberresult[1] == 'X')
2569 numberresult[1] = 'x';
2570 else if (numberresult[1] != 'x') {
2571 memmove(numberresult + 2, numberresult,
2572 strlen(numberresult) + 1);
2573 numberresult[0] = '0';
2574 numberresult[1] = 'x';
2575 numprinted += 2;
2576 }
2577 n += numprinted;
2578 numberresult += (numprinted + 1);
2579 assert(*(numberresult - 1) == '\0');
2580 assert(*(numberresult - 2) != '\0');
2581 assert(numprinted >= 0);
2582 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 case 's':
2585 {
2586 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002587 const char *s = va_arg(count, const char*);
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002588 PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002589 if (!str)
2590 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 /* since PyUnicode_DecodeUTF8 returns already flexible
2592 unicode objects, there is no need to call ready on them */
2593 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002594 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002595 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002596 /* Remember the str and switch to the next slot */
2597 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002598 break;
2599 }
2600 case 'U':
2601 {
2602 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002603 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002604 if (PyUnicode_READY(obj) == -1)
2605 goto fail;
2606 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002607 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 break;
2610 }
2611 case 'V':
2612 {
2613 PyObject *obj = va_arg(count, PyObject *);
2614 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002615 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002617 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002618 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002619 if (PyUnicode_READY(obj) == -1)
2620 goto fail;
2621 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002622 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002624 *callresult++ = NULL;
2625 }
2626 else {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002627 str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002628 if (!str_obj)
2629 goto fail;
Benjamin Petersonbac79492012-01-14 13:34:47 -05002630 if (PyUnicode_READY(str_obj) == -1) {
Victor Stinnere1335c72011-10-04 20:53:03 +02002631 Py_DECREF(str_obj);
2632 goto fail;
2633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Victor Stinnere6abb482012-05-02 01:15:40 +02002635 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002637 *callresult++ = str_obj;
2638 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002639 break;
2640 }
2641 case 'S':
2642 {
2643 PyObject *obj = va_arg(count, PyObject *);
2644 PyObject *str;
2645 assert(obj);
2646 str = PyObject_Str(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002647 if (!str)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002649 if (PyUnicode_READY(str) == -1) {
2650 Py_DECREF(str);
2651 goto fail;
2652 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002653 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Victor Stinnere6abb482012-05-02 01:15:40 +02002654 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002656 /* Remember the str and switch to the next slot */
2657 *callresult++ = str;
2658 break;
2659 }
2660 case 'R':
2661 {
2662 PyObject *obj = va_arg(count, PyObject *);
2663 PyObject *repr;
2664 assert(obj);
2665 repr = PyObject_Repr(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002666 if (!repr)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002667 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002668 if (PyUnicode_READY(repr) == -1) {
2669 Py_DECREF(repr);
2670 goto fail;
2671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Victor Stinnere6abb482012-05-02 01:15:40 +02002673 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 /* Remember the repr and switch to the next slot */
2676 *callresult++ = repr;
2677 break;
2678 }
2679 case 'A':
2680 {
2681 PyObject *obj = va_arg(count, PyObject *);
2682 PyObject *ascii;
2683 assert(obj);
2684 ascii = PyObject_ASCII(obj);
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002685 if (!ascii)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 goto fail;
Benjamin Petersonc8d8b882012-01-14 13:37:31 -05002687 if (PyUnicode_READY(ascii) == -1) {
2688 Py_DECREF(ascii);
2689 goto fail;
2690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002691 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Victor Stinnere6abb482012-05-02 01:15:40 +02002692 maxchar = MAX_MAXCHAR(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002693 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002694 /* Remember the repr and switch to the next slot */
2695 *callresult++ = ascii;
2696 break;
2697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002698 default:
2699 /* if we stumble upon an unknown
2700 formatting code, copy the rest of
2701 the format string to the output
2702 string. (we cannot just skip the
2703 code, since there's no way to know
2704 what's in the argument list) */
2705 n += strlen(p);
2706 goto expand;
2707 }
2708 } else
2709 n++;
2710 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002711 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002713 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 we don't have to resize the string.
2715 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002716 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002717 if (!string)
2718 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002719 kind = PyUnicode_KIND(string);
2720 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002721 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002725 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002726 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002727
2728 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2730 /* checking for == because the last argument could be a empty
2731 string, which causes i to point to end, the assert at the end of
2732 the loop */
2733 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002734
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 switch (*f) {
2736 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002737 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 const int ordinal = va_arg(vargs, int);
2739 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002741 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002742 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002743 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002744 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 case 'p':
Victor Stinnerc5166102012-02-22 13:55:02 +01002747 {
Victor Stinner184252a2012-06-16 02:57:41 +02002748 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 /* unused, since we already have the result */
2750 if (*f == 'p')
2751 (void) va_arg(vargs, void *);
2752 else
2753 (void) va_arg(vargs, int);
2754 /* extract the result from numberresults and append. */
Victor Stinner184252a2012-06-16 02:57:41 +02002755 len = strlen(numberresult);
2756 unicode_write_cstr(string, i, numberresult, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002757 /* skip over the separating '\0' */
Victor Stinner184252a2012-06-16 02:57:41 +02002758 i += len;
2759 numberresult += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002760 assert(*numberresult == '\0');
2761 numberresult++;
2762 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002763 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002764 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002765 case 's':
2766 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002767 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002768 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002769 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002770 size = PyUnicode_GET_LENGTH(*callresult);
2771 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002772 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002774 /* We're done with the unicode()/repr() => forget it */
2775 Py_DECREF(*callresult);
2776 /* switch to next unicode()/repr() result */
2777 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002778 break;
2779 }
2780 case 'U':
2781 {
2782 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002783 Py_ssize_t size;
2784 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2785 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerd3f08822012-05-29 12:57:52 +02002786 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002788 break;
2789 }
2790 case 'V':
2791 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002792 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002793 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002794 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002795 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002796 size = PyUnicode_GET_LENGTH(obj);
2797 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002798 _PyUnicode_FastCopyCharacters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002799 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002800 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801 size = PyUnicode_GET_LENGTH(*callresult);
2802 assert(PyUnicode_KIND(*callresult) <=
2803 PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002804 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002805 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002806 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002807 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002808 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002809 break;
2810 }
2811 case 'S':
2812 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002813 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002814 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002815 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002816 /* unused, since we already have the result */
2817 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerd3f08822012-05-29 12:57:52 +02002819 _PyUnicode_FastCopyCharacters(string, i, *callresult, 0, size);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002820 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002821 /* We're done with the unicode()/repr() => forget it */
2822 Py_DECREF(*callresult);
2823 /* switch to next unicode()/repr() result */
2824 ++callresult;
2825 break;
2826 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002827 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002828 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002829 break;
2830 default:
Victor Stinner184252a2012-06-16 02:57:41 +02002831 {
2832 Py_ssize_t len = strlen(p);
2833 unicode_write_cstr(string, i, p, len);
2834 i += len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002835 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002836 goto end;
2837 }
Victor Stinner184252a2012-06-16 02:57:41 +02002838 }
Victor Stinner1205f272010-09-11 00:54:47 +00002839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840 else {
2841 assert(i < PyUnicode_GET_LENGTH(string));
2842 PyUnicode_WRITE(kind, data, i++, *f);
2843 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002845 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002846
Benjamin Peterson29060642009-01-31 22:14:21 +00002847 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002848 if (callresults)
2849 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002850 if (numberresults)
2851 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002852 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002853 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002854 if (callresults) {
2855 PyObject **callresult2 = callresults;
2856 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002857 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002858 ++callresult2;
2859 }
2860 PyObject_Free(callresults);
2861 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002862 if (numberresults)
2863 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002864 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002865}
2866
Walter Dörwaldd2034312007-05-18 16:29:38 +00002867PyObject *
2868PyUnicode_FromFormat(const char *format, ...)
2869{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002870 PyObject* ret;
2871 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002872
2873#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002874 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002875#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002876 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002877#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002878 ret = PyUnicode_FromFormatV(format, vargs);
2879 va_end(vargs);
2880 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002881}
2882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002883#ifdef HAVE_WCHAR_H
2884
Victor Stinner5593d8a2010-10-02 11:11:27 +00002885/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2886 convert a Unicode object to a wide character string.
2887
Victor Stinnerd88d9832011-09-06 02:00:05 +02002888 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002889 character) required to convert the unicode object. Ignore size argument.
2890
Victor Stinnerd88d9832011-09-06 02:00:05 +02002891 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002892 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002893 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002894static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002895unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002896 wchar_t *w,
2897 Py_ssize_t size)
2898{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002899 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002900 const wchar_t *wstr;
2901
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002902 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002903 if (wstr == NULL)
2904 return -1;
2905
Victor Stinner5593d8a2010-10-02 11:11:27 +00002906 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002907 if (size > res)
2908 size = res + 1;
2909 else
2910 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002911 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002912 return res;
2913 }
2914 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002915 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002916}
2917
2918Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002919PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002920 wchar_t *w,
2921 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922{
2923 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 PyErr_BadInternalCall();
2925 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002927 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002928}
2929
Victor Stinner137c34c2010-09-29 10:25:54 +00002930wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002931PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002932 Py_ssize_t *size)
2933{
2934 wchar_t* buffer;
2935 Py_ssize_t buflen;
2936
2937 if (unicode == NULL) {
2938 PyErr_BadInternalCall();
2939 return NULL;
2940 }
2941
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002942 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002943 if (buflen == -1)
2944 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002945 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002946 PyErr_NoMemory();
2947 return NULL;
2948 }
2949
Victor Stinner137c34c2010-09-29 10:25:54 +00002950 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2951 if (buffer == NULL) {
2952 PyErr_NoMemory();
2953 return NULL;
2954 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002955 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002956 if (buflen == -1) {
2957 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002958 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002959 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002960 if (size != NULL)
2961 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002962 return buffer;
2963}
2964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002965#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
Alexander Belopolsky40018472011-02-26 01:02:56 +00002967PyObject *
2968PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002970 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002971 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 PyErr_SetString(PyExc_ValueError,
2973 "chr() arg not in range(0x110000)");
2974 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002975 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002976
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002977 if ((Py_UCS4)ordinal < 256)
2978 return get_latin1_char((unsigned char)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002980 v = PyUnicode_New(1, ordinal);
2981 if (v == NULL)
2982 return NULL;
2983 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002984 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002985 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002991 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002992 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002993 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002994 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002995 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002996 Py_INCREF(obj);
2997 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002998 }
2999 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003000 /* For a Unicode subtype that's not a Unicode object,
3001 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003002 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003003 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003004 PyErr_Format(PyExc_TypeError,
3005 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00003006 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003007 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003008}
3009
Alexander Belopolsky40018472011-02-26 01:02:56 +00003010PyObject *
3011PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003012 const char *encoding,
3013 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003014{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003015 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003016 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003017
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003019 PyErr_BadInternalCall();
3020 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003022
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003023 /* Decoding bytes objects is the most common case and should be fast */
3024 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003025 if (PyBytes_GET_SIZE(obj) == 0)
3026 _Py_RETURN_UNICODE_EMPTY();
3027 v = PyUnicode_Decode(
3028 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3029 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003030 return v;
3031 }
3032
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003033 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003034 PyErr_SetString(PyExc_TypeError,
3035 "decoding str is not supported");
3036 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003037 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003038
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003039 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3040 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3041 PyErr_Format(PyExc_TypeError,
3042 "coercing to str: need bytes, bytearray "
3043 "or buffer-like object, %.80s found",
3044 Py_TYPE(obj)->tp_name);
3045 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003046 }
Tim Petersced69f82003-09-16 20:30:58 +00003047
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003048 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003049 PyBuffer_Release(&buffer);
3050 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003052
Serhiy Storchaka05997252013-01-26 12:14:02 +02003053 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003054 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003055 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003056}
3057
/* Normalize an encoding name: convert it to lower case and replace '_' with
   '-' in order to catch e.g. UTF_8.  A NULL encoding stands for "utf-8".

   The result is stored in lower, a buffer of lower_len bytes, and is
   null-terminated on success.

   Return 1 on success.  Return 0 if the normalized name, including its
   terminating null byte, does not fit in lower_len bytes; in that case the
   content of lower must not be used (it may be partially written and is not
   null-terminated). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* With no room at all, &lower[lower_len - 1] below would wrap around
       and the bound check would never trigger. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* The default name obeys the same size limit as any other name:
           never copy it into a buffer that cannot hold it. */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3094
Alexander Belopolsky40018472011-02-26 01:02:56 +00003095PyObject *
3096PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003097 Py_ssize_t size,
3098 const char *encoding,
3099 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003100{
3101 PyObject *buffer = NULL, *unicode;
3102 Py_buffer info;
3103 char lower[11]; /* Enough for any encoding shortcut */
3104
Fred Drakee4315f52000-05-09 19:53:39 +00003105 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003106 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003107 if ((strcmp(lower, "utf-8") == 0) ||
3108 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003109 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00003110 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003111 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003112 (strcmp(lower, "iso-8859-1") == 0))
3113 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003114#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003115 else if (strcmp(lower, "mbcs") == 0)
3116 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003117#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003118 else if (strcmp(lower, "ascii") == 0)
3119 return PyUnicode_DecodeASCII(s, size, errors);
3120 else if (strcmp(lower, "utf-16") == 0)
3121 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3122 else if (strcmp(lower, "utf-32") == 0)
3123 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003127 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003128 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003129 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003130 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 if (buffer == NULL)
3132 goto onError;
3133 unicode = PyCodec_Decode(buffer, encoding, errors);
3134 if (unicode == NULL)
3135 goto onError;
3136 if (!PyUnicode_Check(unicode)) {
3137 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003138 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00003139 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003140 Py_DECREF(unicode);
3141 goto onError;
3142 }
3143 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003144 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003145
Benjamin Peterson29060642009-01-31 22:14:21 +00003146 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003147 Py_XDECREF(buffer);
3148 return NULL;
3149}
3150
Alexander Belopolsky40018472011-02-26 01:02:56 +00003151PyObject *
3152PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003153 const char *encoding,
3154 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003155{
3156 PyObject *v;
3157
3158 if (!PyUnicode_Check(unicode)) {
3159 PyErr_BadArgument();
3160 goto onError;
3161 }
3162
3163 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003164 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003165
3166 /* Decode via the codec registry */
3167 v = PyCodec_Decode(unicode, encoding, errors);
3168 if (v == NULL)
3169 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003170 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003171
Benjamin Peterson29060642009-01-31 22:14:21 +00003172 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003173 return NULL;
3174}
3175
Alexander Belopolsky40018472011-02-26 01:02:56 +00003176PyObject *
3177PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003178 const char *encoding,
3179 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003180{
3181 PyObject *v;
3182
3183 if (!PyUnicode_Check(unicode)) {
3184 PyErr_BadArgument();
3185 goto onError;
3186 }
3187
3188 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190
3191 /* Decode via the codec registry */
3192 v = PyCodec_Decode(unicode, encoding, errors);
3193 if (v == NULL)
3194 goto onError;
3195 if (!PyUnicode_Check(v)) {
3196 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003197 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003198 Py_TYPE(v)->tp_name);
3199 Py_DECREF(v);
3200 goto onError;
3201 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003202 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003203
Benjamin Peterson29060642009-01-31 22:14:21 +00003204 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003205 return NULL;
3206}
3207
Alexander Belopolsky40018472011-02-26 01:02:56 +00003208PyObject *
3209PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003210 Py_ssize_t size,
3211 const char *encoding,
3212 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213{
3214 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003215
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 unicode = PyUnicode_FromUnicode(s, size);
3217 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003218 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3220 Py_DECREF(unicode);
3221 return v;
3222}
3223
Alexander Belopolsky40018472011-02-26 01:02:56 +00003224PyObject *
3225PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003226 const char *encoding,
3227 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003228{
3229 PyObject *v;
3230
3231 if (!PyUnicode_Check(unicode)) {
3232 PyErr_BadArgument();
3233 goto onError;
3234 }
3235
3236 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003237 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003238
3239 /* Encode via the codec registry */
3240 v = PyCodec_Encode(unicode, encoding, errors);
3241 if (v == NULL)
3242 goto onError;
3243 return v;
3244
Benjamin Peterson29060642009-01-31 22:14:21 +00003245 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003246 return NULL;
3247}
3248
/* Locate the first character of wstr that wcstombs() fails to convert.

   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide).  Return the index in wstr
   of the first character rejected by wcstombs(), or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (or one surrogate pair) plus the null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif
    while (*cursor != L'\0') {
        const wchar_t *unit = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = *cursor;
            chunk[1] = 0;
            cursor++;
        }
#else
        chunk[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return unit - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3295
Victor Stinner1b579672011-12-17 05:47:23 +01003296static int
3297locale_error_handler(const char *errors, int *surrogateescape)
3298{
3299 if (errors == NULL) {
3300 *surrogateescape = 0;
3301 return 0;
3302 }
3303
3304 if (strcmp(errors, "strict") == 0) {
3305 *surrogateescape = 0;
3306 return 0;
3307 }
3308 if (strcmp(errors, "surrogateescape") == 0) {
3309 *surrogateescape = 1;
3310 return 0;
3311 }
3312 PyErr_Format(PyExc_ValueError,
3313 "only 'strict' and 'surrogateescape' error handlers "
3314 "are supported, not '%s'",
3315 errors);
3316 return -1;
3317}
3318
/* Encode a str object to bytes using the C library's wide-character to
   multibyte conversion (wcstombs(), or _Py_wchar2char() for the
   "surrogateescape" handler).

   errors must be NULL, "strict" or "surrogateescape"; anything else is
   rejected by locale_error_handler().  A string containing an embedded null
   character raises TypeError.  When a character cannot be encoded, a
   UnicodeEncodeError instance for the "locale" codec is built and passed to
   PyCodec_StrictErrors().

   Return a new bytes object, or NULL on failure. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* wcslen() stops at the first null character: a shorter length than the
       one reported by the conversion means the string embeds a null. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* locale encoding with surrogateescape */
        char *str;

        str = _Py_wchar2char(wstr, &error_pos);
        if (str == NULL) {
            /* An error position of (size_t)-1 is treated as a memory
               failure; any other value as the index of the offending
               character. */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        size_t len, len2;

        /* First call with a NULL destination: only compute the length. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            /* Position unknown for now; it is searched for below. */
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call: convert for real, with room for the null byte. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* errno is read first, before any of the calls below
       (NOTE(review): presumably so that they cannot overwrite it). */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* Position not known yet: look for it one character at a time. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Build the exception reason from the strerror() message, falling back
       to a fixed message when that message cannot be converted. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = _Py_char2wchar(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_Free(wstr);
        } else
            errmsg = NULL;
    }
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError(encoding, object, start, end, reason), flagging
       the single character at error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3427
Victor Stinnerad158722010-10-27 00:25:46 +00003428PyObject *
3429PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003430{
Victor Stinner99b95382011-07-04 14:23:54 +02003431#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003432 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003433#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003434 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003435#else
Victor Stinner793b5312011-04-27 00:24:21 +02003436 PyInterpreterState *interp = PyThreadState_GET()->interp;
3437 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3438 cannot use it to encode and decode filenames before it is loaded. Load
3439 the Python codec requires to encode at least its own filename. Use the C
3440 version of the locale codec until the codec registry is initialized and
3441 the Python codec is loaded.
3442
3443 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3444 cannot only rely on it: check also interp->fscodec_initialized for
3445 subinterpreters. */
3446 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003447 return PyUnicode_AsEncodedString(unicode,
3448 Py_FileSystemDefaultEncoding,
3449 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003450 }
3451 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003452 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003453 }
Victor Stinnerad158722010-10-27 00:25:46 +00003454#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003455}
3456
Alexander Belopolsky40018472011-02-26 01:02:56 +00003457PyObject *
3458PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003459 const char *encoding,
3460 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003461{
3462 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003463 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003464
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 if (!PyUnicode_Check(unicode)) {
3466 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 }
Fred Drakee4315f52000-05-09 19:53:39 +00003469
Fred Drakee4315f52000-05-09 19:53:39 +00003470 /* Shortcuts for common default encodings */
Victor Stinner20b654a2013-01-03 01:08:58 +01003471 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003472 if ((strcmp(lower, "utf-8") == 0) ||
3473 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003474 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003475 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003476 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003477 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003478 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003479 }
Victor Stinner37296e82010-06-10 13:36:23 +00003480 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003481 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003482 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003483 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003484#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003485 else if (strcmp(lower, "mbcs") == 0)
3486 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003487#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003488 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003489 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003490 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491
3492 /* Encode via the codec registry */
3493 v = PyCodec_Encode(unicode, encoding, errors);
3494 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003495 return NULL;
3496
3497 /* The normal path */
3498 if (PyBytes_Check(v))
3499 return v;
3500
3501 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003502 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003503 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003504 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003505
3506 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3507 "encoder %s returned bytearray instead of bytes",
3508 encoding);
3509 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003510 Py_DECREF(v);
3511 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003512 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003513
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003514 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3515 Py_DECREF(v);
3516 return b;
3517 }
3518
3519 PyErr_Format(PyExc_TypeError,
3520 "encoder did not return a bytes object (type=%.400s)",
3521 Py_TYPE(v)->tp_name);
3522 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003523 return NULL;
3524}
3525
Alexander Belopolsky40018472011-02-26 01:02:56 +00003526PyObject *
3527PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003528 const char *encoding,
3529 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003530{
3531 PyObject *v;
3532
3533 if (!PyUnicode_Check(unicode)) {
3534 PyErr_BadArgument();
3535 goto onError;
3536 }
3537
3538 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003539 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003540
3541 /* Encode via the codec registry */
3542 v = PyCodec_Encode(unicode, encoding, errors);
3543 if (v == NULL)
3544 goto onError;
3545 if (!PyUnicode_Check(v)) {
3546 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003547 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003548 Py_TYPE(v)->tp_name);
3549 Py_DECREF(v);
3550 goto onError;
3551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003553
Benjamin Peterson29060642009-01-31 22:14:21 +00003554 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003555 return NULL;
3556}
3557
/* Locate the first byte sequence of `str` (at most `len` bytes) that
   mbrtowc() rejects as invalid or reports as incomplete.

   Returns the byte offset of that sequence.  Returns 0 when no such sequence
   is found, when a NUL character is reached first, or when HAVE_MBRTOWC is
   not defined (in which case nothing is scanned at all). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    for (; remaining != 0; ) {
        size_t consumed = mbrtowc(&wc, (char*)cursor, remaining, &state);

        if (consumed == 0) {
            /* NUL character decoded: end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* invalid sequence, or sequence cut short by `remaining` */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* no undecodable byte sequence was found */
#endif
    return 0;
}
3588
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003589PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003590PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003591 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003592{
3593 wchar_t smallbuf[256];
3594 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3595 wchar_t *wstr;
3596 size_t wlen, wlen2;
3597 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003598 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003599 size_t error_pos;
3600 char *errmsg;
3601 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003602
3603 if (locale_error_handler(errors, &surrogateescape) < 0)
3604 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003605
3606 if (str[len] != '\0' || len != strlen(str)) {
3607 PyErr_SetString(PyExc_TypeError, "embedded null character");
3608 return NULL;
3609 }
3610
3611 if (surrogateescape)
3612 {
3613 wstr = _Py_char2wchar(str, &wlen);
3614 if (wstr == NULL) {
3615 if (wlen == (size_t)-1)
3616 PyErr_NoMemory();
3617 else
3618 PyErr_SetFromErrno(PyExc_OSError);
3619 return NULL;
3620 }
3621
3622 unicode = PyUnicode_FromWideChar(wstr, wlen);
3623 PyMem_Free(wstr);
3624 }
3625 else {
3626#ifndef HAVE_BROKEN_MBSTOWCS
3627 wlen = mbstowcs(NULL, str, 0);
3628#else
3629 wlen = len;
3630#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003631 if (wlen == (size_t)-1)
3632 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003633 if (wlen+1 <= smallbuf_len) {
3634 wstr = smallbuf;
3635 }
3636 else {
3637 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3638 return PyErr_NoMemory();
3639
3640 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3641 if (!wstr)
3642 return PyErr_NoMemory();
3643 }
3644
3645 /* This shouldn't fail now */
3646 wlen2 = mbstowcs(wstr, str, wlen+1);
3647 if (wlen2 == (size_t)-1) {
3648 if (wstr != smallbuf)
3649 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003650 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003651 }
3652#ifdef HAVE_BROKEN_MBSTOWCS
3653 assert(wlen2 == wlen);
3654#endif
3655 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3656 if (wstr != smallbuf)
3657 PyMem_Free(wstr);
3658 }
3659 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003660
3661decode_error:
3662 errmsg = strerror(errno);
3663 assert(errmsg != NULL);
3664
3665 error_pos = mbstowcs_errorpos(str, len);
3666 if (errmsg != NULL) {
3667 size_t errlen;
3668 wstr = _Py_char2wchar(errmsg, &errlen);
3669 if (wstr != NULL) {
3670 reason = PyUnicode_FromWideChar(wstr, errlen);
3671 PyMem_Free(wstr);
3672 } else
3673 errmsg = NULL;
3674 }
3675 if (errmsg == NULL)
3676 reason = PyUnicode_FromString(
3677 "mbstowcs() encountered an invalid multibyte sequence");
3678 if (reason == NULL)
3679 return NULL;
3680
3681 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3682 "locale", str, len,
3683 (Py_ssize_t)error_pos,
3684 (Py_ssize_t)(error_pos+1),
3685 reason);
3686 Py_DECREF(reason);
3687 if (exc != NULL) {
3688 PyCodec_StrictErrors(exc);
3689 Py_XDECREF(exc);
3690 }
3691 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003692}
3693
3694PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003695PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003696{
3697 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003698 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003699}
3700
3701
3702PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003703PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003704 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003705 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3706}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003707
Christian Heimes5894ba72007-11-04 11:43:14 +00003708PyObject*
3709PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3710{
Victor Stinner99b95382011-07-04 14:23:54 +02003711#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003712 return PyUnicode_DecodeMBCS(s, size, NULL);
3713#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003714 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003715#else
Victor Stinner793b5312011-04-27 00:24:21 +02003716 PyInterpreterState *interp = PyThreadState_GET()->interp;
3717 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3718 cannot use it to encode and decode filenames before it is loaded. Load
3719 the Python codec requires to encode at least its own filename. Use the C
3720 version of the locale codec until the codec registry is initialized and
3721 the Python codec is loaded.
3722
3723 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3724 cannot only rely on it: check also interp->fscodec_initialized for
3725 subinterpreters. */
3726 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003727 return PyUnicode_Decode(s, size,
3728 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003729 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003730 }
3731 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003732 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003733 }
Victor Stinnerad158722010-10-27 00:25:46 +00003734#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003735}
3736
Martin v. Löwis011e8422009-05-05 04:43:17 +00003737
3738int
Antoine Pitrou13348842012-01-29 18:36:34 +01003739_PyUnicode_HasNULChars(PyObject* s)
3740{
3741 static PyObject *nul = NULL;
3742
3743 if (nul == NULL)
3744 nul = PyUnicode_FromStringAndSize("\0", 1);
3745 if (nul == NULL)
3746 return -1;
3747 return PyUnicode_Contains(s, nul);
3748}
3749
3750
3751int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003752PyUnicode_FSConverter(PyObject* arg, void* addr)
3753{
3754 PyObject *output = NULL;
3755 Py_ssize_t size;
3756 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003757 if (arg == NULL) {
3758 Py_DECREF(*(PyObject**)addr);
3759 return 1;
3760 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003761 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003762 output = arg;
3763 Py_INCREF(output);
3764 }
3765 else {
3766 arg = PyUnicode_FromObject(arg);
3767 if (!arg)
3768 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003769 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003770 Py_DECREF(arg);
3771 if (!output)
3772 return 0;
3773 if (!PyBytes_Check(output)) {
3774 Py_DECREF(output);
3775 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3776 return 0;
3777 }
3778 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003779 size = PyBytes_GET_SIZE(output);
3780 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003781 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003782 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003783 Py_DECREF(output);
3784 return 0;
3785 }
3786 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003787 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003788}
3789
3790
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003791int
3792PyUnicode_FSDecoder(PyObject* arg, void* addr)
3793{
3794 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003795 if (arg == NULL) {
3796 Py_DECREF(*(PyObject**)addr);
3797 return 1;
3798 }
3799 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003800 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003801 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003802 output = arg;
3803 Py_INCREF(output);
3804 }
3805 else {
3806 arg = PyBytes_FromObject(arg);
3807 if (!arg)
3808 return 0;
3809 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3810 PyBytes_GET_SIZE(arg));
3811 Py_DECREF(arg);
3812 if (!output)
3813 return 0;
3814 if (!PyUnicode_Check(output)) {
3815 Py_DECREF(output);
3816 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3817 return 0;
3818 }
3819 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003820 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003821 Py_DECREF(output);
3822 return 0;
3823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003825 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003826 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3827 Py_DECREF(output);
3828 return 0;
3829 }
3830 *(PyObject**)addr = output;
3831 return Py_CLEANUP_SUPPORTED;
3832}
3833
3834
Martin v. Löwis5b222132007-06-10 09:51:05 +00003835char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003837{
Christian Heimesf3863112007-11-22 07:46:41 +00003838 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003840 if (!PyUnicode_Check(unicode)) {
3841 PyErr_BadArgument();
3842 return NULL;
3843 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003844 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003845 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003846
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003847 if (PyUnicode_UTF8(unicode) == NULL) {
3848 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003849 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3850 if (bytes == NULL)
3851 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003852 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3853 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 Py_DECREF(bytes);
3855 return NULL;
3856 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003857 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3858 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3859 PyBytes_AS_STRING(bytes),
3860 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861 Py_DECREF(bytes);
3862 }
3863
3864 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003865 *psize = PyUnicode_UTF8_LENGTH(unicode);
3866 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003867}
3868
3869char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003870PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003871{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003872 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3873}
3874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875Py_UNICODE *
3876PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003878 const unsigned char *one_byte;
3879#if SIZEOF_WCHAR_T == 4
3880 const Py_UCS2 *two_bytes;
3881#else
3882 const Py_UCS4 *four_bytes;
3883 const Py_UCS4 *ucs4_end;
3884 Py_ssize_t num_surrogates;
3885#endif
3886 wchar_t *w;
3887 wchar_t *wchar_end;
3888
3889 if (!PyUnicode_Check(unicode)) {
3890 PyErr_BadArgument();
3891 return NULL;
3892 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003893 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003895 assert(_PyUnicode_KIND(unicode) != 0);
3896 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003897
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003898 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003900 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3901 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003902 num_surrogates = 0;
3903
3904 for (; four_bytes < ucs4_end; ++four_bytes) {
3905 if (*four_bytes > 0xFFFF)
3906 ++num_surrogates;
3907 }
3908
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003909 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3910 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3911 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003912 PyErr_NoMemory();
3913 return NULL;
3914 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003915 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003916
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003917 w = _PyUnicode_WSTR(unicode);
3918 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3919 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3921 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003922 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003924 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3925 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 }
3927 else
3928 *w = *four_bytes;
3929
3930 if (w > wchar_end) {
3931 assert(0 && "Miscalculated string end");
3932 }
3933 }
3934 *w = 0;
3935#else
3936 /* sizeof(wchar_t) == 4 */
3937 Py_FatalError("Impossible unicode object state, wstr and str "
3938 "should share memory already.");
3939 return NULL;
3940#endif
3941 }
3942 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3944 (_PyUnicode_LENGTH(unicode) + 1));
3945 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003946 PyErr_NoMemory();
3947 return NULL;
3948 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003949 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3950 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3951 w = _PyUnicode_WSTR(unicode);
3952 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003953
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003954 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3955 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003956 for (; w < wchar_end; ++one_byte, ++w)
3957 *w = *one_byte;
3958 /* null-terminate the wstr */
3959 *w = 0;
3960 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003961 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003962#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003963 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 for (; w < wchar_end; ++two_bytes, ++w)
3965 *w = *two_bytes;
3966 /* null-terminate the wstr */
3967 *w = 0;
3968#else
3969 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003970 PyObject_FREE(_PyUnicode_WSTR(unicode));
3971 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 Py_FatalError("Impossible unicode object state, wstr "
3973 "and str should share memory already.");
3974 return NULL;
3975#endif
3976 }
3977 else {
3978 assert(0 && "This should never happen.");
3979 }
3980 }
3981 }
3982 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003983 *size = PyUnicode_WSTR_LENGTH(unicode);
3984 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003985}
3986
Alexander Belopolsky40018472011-02-26 01:02:56 +00003987Py_UNICODE *
3988PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003990 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991}
3992
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003993
Alexander Belopolsky40018472011-02-26 01:02:56 +00003994Py_ssize_t
3995PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996{
3997 if (!PyUnicode_Check(unicode)) {
3998 PyErr_BadArgument();
3999 goto onError;
4000 }
4001 return PyUnicode_GET_SIZE(unicode);
4002
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 return -1;
4005}
4006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007Py_ssize_t
4008PyUnicode_GetLength(PyObject *unicode)
4009{
Victor Stinner07621332012-06-16 04:53:46 +02004010 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011 PyErr_BadArgument();
4012 return -1;
4013 }
Victor Stinner07621332012-06-16 04:53:46 +02004014 if (PyUnicode_READY(unicode) == -1)
4015 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016 return PyUnicode_GET_LENGTH(unicode);
4017}
4018
4019Py_UCS4
4020PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4021{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004022 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
4023 PyErr_BadArgument();
4024 return (Py_UCS4)-1;
4025 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004026 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004027 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004028 return (Py_UCS4)-1;
4029 }
4030 return PyUnicode_READ_CHAR(unicode, index);
4031}
4032
4033int
4034PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4035{
4036 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004037 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 return -1;
4039 }
Victor Stinner488fa492011-12-12 00:01:39 +01004040 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004041 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004042 PyErr_SetString(PyExc_IndexError, "string index out of range");
4043 return -1;
4044 }
Victor Stinner488fa492011-12-12 00:01:39 +01004045 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004046 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004047 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4048 PyErr_SetString(PyExc_ValueError, "character out of range");
4049 return -1;
4050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4052 index, ch);
4053 return 0;
4054}
4055
/* Return the name of the default encoding, which is always "utf-8".
   The result points at a string literal; callers must not modify or
   free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";

    return default_encoding;
}
4061
Victor Stinner554f3f02010-06-16 23:33:54 +00004062/* create or adjust a UnicodeDecodeError */
4063static void
4064make_decode_exception(PyObject **exceptionObject,
4065 const char *encoding,
4066 const char *input, Py_ssize_t length,
4067 Py_ssize_t startpos, Py_ssize_t endpos,
4068 const char *reason)
4069{
4070 if (*exceptionObject == NULL) {
4071 *exceptionObject = PyUnicodeDecodeError_Create(
4072 encoding, input, length, startpos, endpos, reason);
4073 }
4074 else {
4075 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4076 goto onError;
4077 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4078 goto onError;
4079 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4080 goto onError;
4081 }
4082 return;
4083
4084onError:
4085 Py_DECREF(*exceptionObject);
4086 *exceptionObject = NULL;
4087}
4088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004089/* error handling callback helper:
4090 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004091 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004092 and adjust various state variables.
4093 return 0 on success, -1 on error
4094*/
4095
Alexander Belopolsky40018472011-02-26 01:02:56 +00004096static int
4097unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004098 const char *encoding, const char *reason,
4099 const char **input, const char **inend, Py_ssize_t *startinpos,
4100 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004101 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004102{
Benjamin Peterson142957c2008-07-04 19:55:29 +00004103 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004104
4105 PyObject *restuple = NULL;
4106 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004107 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004108 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004109 Py_ssize_t requiredsize;
4110 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004111 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004112 int res = -1;
4113
Victor Stinner596a6c42011-11-09 00:02:18 +01004114 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
4115 outsize = PyUnicode_GET_LENGTH(*output);
4116 else
4117 outsize = _PyUnicode_WSTR_LENGTH(*output);
4118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004119 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 *errorHandler = PyCodec_LookupError(errors);
4121 if (*errorHandler == NULL)
4122 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004123 }
4124
Victor Stinner554f3f02010-06-16 23:33:54 +00004125 make_decode_exception(exceptionObject,
4126 encoding,
4127 *input, *inend - *input,
4128 *startinpos, *endinpos,
4129 reason);
4130 if (*exceptionObject == NULL)
4131 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004132
4133 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4134 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004135 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004136 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00004137 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004138 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139 }
4140 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004141 goto onError;
Benjamin Petersonbac79492012-01-14 13:34:47 -05004142 if (PyUnicode_READY(repunicode) == -1)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004143 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004144
4145 /* Copy back the bytes variables, which might have been modified by the
4146 callback */
4147 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4148 if (!inputobj)
4149 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004150 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004151 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004152 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004153 *input = PyBytes_AS_STRING(inputobj);
4154 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004155 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004156 /* we can DECREF safely, as the exception has another reference,
4157 so the object won't go away. */
4158 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004159
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004161 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004162 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004163 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4164 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166
Victor Stinner596a6c42011-11-09 00:02:18 +01004167 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
4168 /* need more space? (at least enough for what we
4169 have+the replacement+the rest of the string (starting
4170 at the new input position), so we won't have to check space
4171 when there are no errors in the rest of the string) */
4172 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
4173 requiredsize = *outpos + replen + insize-newpos;
4174 if (requiredsize > outsize) {
4175 if (requiredsize<2*outsize)
4176 requiredsize = 2*outsize;
4177 if (unicode_resize(output, requiredsize) < 0)
4178 goto onError;
4179 }
Victor Stinner1b487b42012-05-03 12:29:04 +02004180 if (unicode_widen(output, *outpos,
4181 PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +02004183 _PyUnicode_FastCopyCharacters(*output, *outpos, repunicode, 0, replen);
Victor Stinner596a6c42011-11-09 00:02:18 +01004184 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004185 }
Victor Stinner596a6c42011-11-09 00:02:18 +01004186 else {
4187 wchar_t *repwstr;
4188 Py_ssize_t repwlen;
4189 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4190 if (repwstr == NULL)
4191 goto onError;
4192 /* need more space? (at least enough for what we
4193 have+the replacement+the rest of the string (starting
4194 at the new input position), so we won't have to check space
4195 when there are no errors in the rest of the string) */
4196 requiredsize = *outpos + repwlen + insize-newpos;
4197 if (requiredsize > outsize) {
4198 if (requiredsize < 2*outsize)
4199 requiredsize = 2*outsize;
4200 if (unicode_resize(output, requiredsize) < 0)
4201 goto onError;
4202 }
4203 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4204 *outpos += repwlen;
4205 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004206 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004207 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004208
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004209 /* we made it! */
4210 res = 0;
4211
Benjamin Peterson29060642009-01-31 22:14:21 +00004212 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004213 Py_XDECREF(restuple);
4214 return res;
4215}
4216
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004217/* --- UTF-7 Codec -------------------------------------------------------- */
4218
Antoine Pitrou244651a2009-05-04 18:56:13 +00004219/* See RFC2152 for details. We encode conservatively and decode liberally. */
4220
4221/* Three simple macros defining base-64. */
4222
/* True when c is one of the 64 base-64 alphabet characters:
   A-Z, a-z, 0-9, '+' or '/'.  Note that c is evaluated several times. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     (c) == '+' ||                          \
     (c) == '/')
4230
/* Map a base-64 character to its 6-bit value: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62 and anything else -> 63.  Only meaningful when c
   is a base-64 character; c is evaluated several times. */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'             \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26      \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52      \
     : (c) == '+' ? 62                                  \
     : 63)
4238
/* Base-64 character for the low 6 bits of n (higher bits are masked off). */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
     [(n) & 0x3f])
4243
/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies except
 * '+', which introduces a base64 sequence. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4251
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4285
Antoine Pitrou244651a2009-05-04 18:56:13 +00004286/* ENCODE_DIRECT: this character should be encoded as itself. The
4287 * answer depends on whether we are encoding set O as itself, and also
4288 * on whether we are encoding whitespace as itself. RFC2152 makes it
4289 * clear that the answers to these questions vary between
4290 * applications, so this code needs to be flexible. */
Marc-André Lemburge115ec82005-10-19 22:33:31 +00004291
Antoine Pitrou244651a2009-05-04 18:56:13 +00004292#define ENCODE_DIRECT(c, directO, directWS) \
4293 ((c) < 128 && (c) > 0 && \
4294 ((utf7_category[(c)] == 0) || \
4295 (directWS && (utf7_category[(c)] == 2)) || \
4296 (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004297
Alexander Belopolsky40018472011-02-26 01:02:56 +00004298PyObject *
4299PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004300 Py_ssize_t size,
4301 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004302{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004303 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4304}
4305
Antoine Pitrou244651a2009-05-04 18:56:13 +00004306/* The decoder. The only state we preserve is our read position,
4307 * i.e. how many characters we have consumed. So if we end in the
4308 * middle of a shift sequence we have to back off the read position
4309 * and the output to the beginning of the sequence, otherwise we lose
4310 * all the shift state (seen bits, number of bits seen, high
4311 * surrogate). */
4312
Alexander Belopolsky40018472011-02-26 01:02:56 +00004313PyObject *
4314PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004315 Py_ssize_t size,
4316 const char *errors,
4317 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004318{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004319 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004320 Py_ssize_t startinpos;
4321 Py_ssize_t endinpos;
4322 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004323 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004324 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004325 const char *errmsg = "";
4326 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004327 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004328 unsigned int base64bits = 0;
4329 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004330 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004331 PyObject *errorHandler = NULL;
4332 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004333
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004334 /* Start off assuming it's all ASCII. Widen later as necessary. */
4335 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 if (!unicode)
4337 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004338 if (size == 0) {
4339 if (consumed)
4340 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004341 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004342 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004344 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004345 e = s + size;
4346
4347 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004348 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004349 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004350 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351
Antoine Pitrou244651a2009-05-04 18:56:13 +00004352 if (inShift) { /* in a base-64 section */
4353 if (IS_BASE64(ch)) { /* consume a base-64 character */
4354 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4355 base64bits += 6;
4356 s++;
4357 if (base64bits >= 16) {
4358 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004359 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004360 base64bits -= 16;
4361 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4362 if (surrogate) {
4363 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004364 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4365 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004366 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
4367 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004368 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004369 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004370 }
4371 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004372 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4373 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004374 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004375 }
4376 }
Victor Stinner551ac952011-11-29 22:58:13 +01004377 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004378 /* first surrogate */
4379 surrogate = outCh;
4380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004382 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
4383 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004384 }
4385 }
4386 }
4387 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004388 inShift = 0;
4389 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004390 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01004391 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
4392 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004393 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004394 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 if (base64bits > 0) { /* left-over bits */
4396 if (base64bits >= 6) {
4397 /* We've seen at least one base-64 character */
4398 errmsg = "partial character in shift sequence";
4399 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004400 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004401 else {
4402 /* Some bits remain; they should be zero */
4403 if (base64buffer != 0) {
4404 errmsg = "non-zero padding bits in shift sequence";
4405 goto utf7Error;
4406 }
4407 }
4408 }
4409 if (ch != '-') {
4410 /* '-' is absorbed; other terminating
4411 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004412 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4413 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004414 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
4416 }
4417 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004418 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004419 s++; /* consume '+' */
4420 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004422 if (unicode_putchar(&unicode, &outpos, '+') < 0)
4423 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 }
4425 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004426 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004427 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004428 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004429 }
4430 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004431 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004432 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4433 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 s++;
4435 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 else {
4437 startinpos = s-starts;
4438 s++;
4439 errmsg = "unexpected special character";
4440 goto utf7Error;
4441 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004443utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004444 endinpos = s-starts;
4445 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 errors, &errorHandler,
4447 "utf7", errmsg,
4448 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004449 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004450 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004451 }
4452
Antoine Pitrou244651a2009-05-04 18:56:13 +00004453 /* end of string */
4454
4455 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4456 /* if we're in an inconsistent state, that's an error */
4457 if (surrogate ||
4458 (base64bits >= 6) ||
4459 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004460 endinpos = size;
4461 if (unicode_decode_call_errorhandler(
4462 errors, &errorHandler,
4463 "utf7", "unterminated shift sequence",
4464 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004465 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 goto onError;
4467 if (s < e)
4468 goto restart;
4469 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004471
4472 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004474 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004475 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004476 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004477 }
4478 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004479 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004480 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004481 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004482
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004483 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 goto onError;
4485
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004486 Py_XDECREF(errorHandler);
4487 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004488 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004491 Py_XDECREF(errorHandler);
4492 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004493 Py_DECREF(unicode);
4494 return NULL;
4495}
4496
4497
Alexander Belopolsky40018472011-02-26 01:02:56 +00004498PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004499_PyUnicode_EncodeUTF7(PyObject *str,
4500 int base64SetO,
4501 int base64WhiteSpace,
4502 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004504 int kind;
4505 void *data;
4506 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004507 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004508 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004509 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 unsigned int base64bits = 0;
4511 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512 char * out;
4513 char * start;
4514
Benjamin Petersonbac79492012-01-14 13:34:47 -05004515 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004516 return NULL;
4517 kind = PyUnicode_KIND(str);
4518 data = PyUnicode_DATA(str);
4519 len = PyUnicode_GET_LENGTH(str);
4520
4521 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004524 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004525 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004526 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004527 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 if (v == NULL)
4529 return NULL;
4530
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004531 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004532 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004533 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 if (inShift) {
4536 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4537 /* shifting out */
4538 if (base64bits) { /* output remaining bits */
4539 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4540 base64buffer = 0;
4541 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542 }
4543 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004544 /* Characters not in the BASE64 set implicitly unshift the sequence
4545 so no '-' is required, except if the character is itself a '-' */
4546 if (IS_BASE64(ch) || ch == '-') {
4547 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004548 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004549 *out++ = (char) ch;
4550 }
4551 else {
4552 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004553 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004555 else { /* not in a shift sequence */
4556 if (ch == '+') {
4557 *out++ = '+';
4558 *out++ = '-';
4559 }
4560 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4561 *out++ = (char) ch;
4562 }
4563 else {
4564 *out++ = '+';
4565 inShift = 1;
4566 goto encode_char;
4567 }
4568 }
4569 continue;
4570encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004571 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004572 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004573
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 /* code first surrogate */
4575 base64bits += 16;
4576 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4577 while (base64bits >= 6) {
4578 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4579 base64bits -= 6;
4580 }
4581 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004582 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 base64bits += 16;
4585 base64buffer = (base64buffer << 16) | ch;
4586 while (base64bits >= 6) {
4587 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4588 base64bits -= 6;
4589 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004590 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004591 if (base64bits)
4592 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4593 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004594 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004595 if (_PyBytes_Resize(&v, out - start) < 0)
4596 return NULL;
4597 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004599PyObject *
4600PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4601 Py_ssize_t size,
4602 int base64SetO,
4603 int base64WhiteSpace,
4604 const char *errors)
4605{
4606 PyObject *result;
4607 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4608 if (tmp == NULL)
4609 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004610 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004611 base64WhiteSpace, errors);
4612 Py_DECREF(tmp);
4613 return result;
4614}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616#undef IS_BASE64
4617#undef FROM_BASE64
4618#undef TO_BASE64
4619#undef DECODE_DIRECT
4620#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004621
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622/* --- UTF-8 Codec -------------------------------------------------------- */
4623
Alexander Belopolsky40018472011-02-26 01:02:56 +00004624PyObject *
4625PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004626 Py_ssize_t size,
4627 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628{
Walter Dörwald69652032004-09-07 20:24:22 +00004629 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4630}
4631
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004632#include "stringlib/asciilib.h"
4633#include "stringlib/codecs.h"
4634#include "stringlib/undef.h"
4635
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004636#include "stringlib/ucs1lib.h"
4637#include "stringlib/codecs.h"
4638#include "stringlib/undef.h"
4639
4640#include "stringlib/ucs2lib.h"
4641#include "stringlib/codecs.h"
4642#include "stringlib/undef.h"
4643
4644#include "stringlib/ucs4lib.h"
4645#include "stringlib/codecs.h"
4646#include "stringlib/undef.h"
4647
Antoine Pitrouab868312009-01-10 15:40:25 +00004648/* Mask to quickly check whether a C 'long' contains a
4649 non-ASCII, UTF8-encoded char. */
4650#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004651# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004652#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004653# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004654#else
4655# error C 'long' size should be either 4 or 8!
4656#endif
4657
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004658static Py_ssize_t
4659ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004660{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004661 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004662 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004663
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004664#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004665 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4666 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004667 /* Fast path, see in STRINGLIB(utf8_decode) for
4668 an explanation. */
4669 /* Help register allocation */
4670 register const char *_p = p;
4671 register Py_UCS1 * q = dest;
4672 while (_p < aligned_end) {
4673 unsigned long value = *(const unsigned long *) _p;
4674 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004676 *((unsigned long *)q) = value;
4677 _p += SIZEOF_LONG;
4678 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004679 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004680 p = _p;
4681 while (p < end) {
4682 if ((unsigned char)*p & 0x80)
4683 break;
4684 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004688#endif
4689 while (p < end) {
4690 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4691 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004692 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004693 /* Help register allocation */
4694 register const char *_p = p;
4695 while (_p < aligned_end) {
4696 unsigned long value = *(unsigned long *) _p;
4697 if (value & ASCII_CHAR_MASK)
4698 break;
4699 _p += SIZEOF_LONG;
4700 }
4701 p = _p;
4702 if (_p == end)
4703 break;
4704 }
4705 if ((unsigned char)*p & 0x80)
4706 break;
4707 ++p;
4708 }
4709 memcpy(dest, start, p - start);
4710 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711}
Antoine Pitrouab868312009-01-10 15:40:25 +00004712
Victor Stinner785938e2011-12-11 20:09:03 +01004713PyObject *
4714PyUnicode_DecodeUTF8Stateful(const char *s,
4715 Py_ssize_t size,
4716 const char *errors,
4717 Py_ssize_t *consumed)
4718{
Victor Stinner785938e2011-12-11 20:09:03 +01004719 PyObject *unicode;
Victor Stinner785938e2011-12-11 20:09:03 +01004720 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004721 const char *end = s + size;
4722 Py_ssize_t outpos;
4723
4724 Py_ssize_t startinpos;
4725 Py_ssize_t endinpos;
4726 const char *errmsg = "";
4727 PyObject *errorHandler = NULL;
4728 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004729
4730 if (size == 0) {
4731 if (consumed)
4732 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004733 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004734 }
4735
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4737 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004738 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004739 *consumed = 1;
4740 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004741 }
4742
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004743 unicode = PyUnicode_New(size, 127);
Victor Stinner785938e2011-12-11 20:09:03 +01004744 if (!unicode)
4745 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004746
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 outpos = ascii_decode(s, end, PyUnicode_1BYTE_DATA(unicode));
4748 s += outpos;
4749 while (s < end) {
4750 Py_UCS4 ch;
4751 int kind = PyUnicode_KIND(unicode);
4752 if (kind == PyUnicode_1BYTE_KIND) {
4753 if (PyUnicode_IS_ASCII(unicode))
4754 ch = asciilib_utf8_decode(&s, end,
4755 PyUnicode_1BYTE_DATA(unicode), &outpos);
4756 else
4757 ch = ucs1lib_utf8_decode(&s, end,
4758 PyUnicode_1BYTE_DATA(unicode), &outpos);
4759 } else if (kind == PyUnicode_2BYTE_KIND) {
4760 ch = ucs2lib_utf8_decode(&s, end,
4761 PyUnicode_2BYTE_DATA(unicode), &outpos);
4762 } else {
4763 assert(kind == PyUnicode_4BYTE_KIND);
4764 ch = ucs4lib_utf8_decode(&s, end,
4765 PyUnicode_4BYTE_DATA(unicode), &outpos);
4766 }
4767
4768 switch (ch) {
4769 case 0:
4770 if (s == end || consumed)
4771 goto End;
4772 errmsg = "unexpected end of data";
4773 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004774 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 break;
4776 case 1:
4777 errmsg = "invalid start byte";
4778 startinpos = s - starts;
4779 endinpos = startinpos + 1;
4780 break;
4781 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004782 case 3:
4783 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004784 errmsg = "invalid continuation byte";
4785 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004786 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787 break;
4788 default:
4789 if (unicode_putchar(&unicode, &outpos, ch) < 0)
4790 goto onError;
4791 continue;
4792 }
4793
4794 if (unicode_decode_call_errorhandler(
4795 errors, &errorHandler,
4796 "utf-8", errmsg,
4797 &starts, &end, &startinpos, &endinpos, &exc, &s,
4798 &unicode, &outpos))
4799 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004800 }
4801
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004802End:
4803 if (unicode_resize(&unicode, outpos) < 0)
4804 goto onError;
4805
4806 if (consumed)
4807 *consumed = s - starts;
4808
4809 Py_XDECREF(errorHandler);
4810 Py_XDECREF(exc);
4811 assert(_PyUnicode_CheckConsistency(unicode, 1));
4812 return unicode;
4813
4814onError:
4815 Py_XDECREF(errorHandler);
4816 Py_XDECREF(exc);
4817 Py_XDECREF(unicode);
4818 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004819}
4820
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004821#ifdef __APPLE__
4822
4823/* Simplified UTF-8 decoder using surrogateescape error handler,
Victor Stinner27b1ca22012-12-03 12:47:59 +01004824 used to decode the command line arguments on Mac OS X.
4825
4826 Return a pointer to a newly allocated wide character string (use
4827 PyMem_Free() to free the memory), or NULL on memory allocation error. */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004828
4829wchar_t*
4830_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4831{
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004832 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 wchar_t *unicode;
4834 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004835
4836 /* Note: size will always be longer than the resulting Unicode
4837 character count */
Victor Stinner27b1ca22012-12-03 12:47:59 +01004838 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004839 return NULL;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004840 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4841 if (!unicode)
4842 return NULL;
4843
4844 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004845 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004846 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004847 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004848 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004849#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004850 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004851#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004852 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004853#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004854 if (ch > 0xFF) {
4855#if SIZEOF_WCHAR_T == 4
4856 assert(0);
4857#else
4858 assert(Py_UNICODE_IS_SURROGATE(ch));
4859 /* compute and append the two surrogates: */
4860 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4861 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4862#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004863 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864 else {
4865 if (!ch && s == e)
4866 break;
4867 /* surrogateescape */
4868 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4869 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004870 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004871 unicode[outpos] = L'\0';
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004872 return unicode;
4873}
4874
4875#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877/* Primary internal function which creates utf8 encoded bytes objects.
4878
4879 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004880 and allocate exactly as much space needed at the end. Else allocate the
4881 maximum possible needed (4 result bytes per Unicode character), and return
4882 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004883*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004884PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004885_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886{
Victor Stinner6099a032011-12-18 14:22:26 +01004887 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888 void *data;
4889 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891 if (!PyUnicode_Check(unicode)) {
4892 PyErr_BadArgument();
4893 return NULL;
4894 }
4895
4896 if (PyUnicode_READY(unicode) == -1)
4897 return NULL;
4898
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004899 if (PyUnicode_UTF8(unicode))
4900 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4901 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004902
4903 kind = PyUnicode_KIND(unicode);
4904 data = PyUnicode_DATA(unicode);
4905 size = PyUnicode_GET_LENGTH(unicode);
4906
Benjamin Petersonead6b532011-12-20 17:23:42 -06004907 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004908 default:
4909 assert(0);
4910 case PyUnicode_1BYTE_KIND:
4911 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4912 assert(!PyUnicode_IS_ASCII(unicode));
4913 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4914 case PyUnicode_2BYTE_KIND:
4915 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4916 case PyUnicode_4BYTE_KIND:
4917 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919}
4920
Alexander Belopolsky40018472011-02-26 01:02:56 +00004921PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004922PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4923 Py_ssize_t size,
4924 const char *errors)
4925{
4926 PyObject *v, *unicode;
4927
4928 unicode = PyUnicode_FromUnicode(s, size);
4929 if (unicode == NULL)
4930 return NULL;
4931 v = _PyUnicode_AsUTF8String(unicode, errors);
4932 Py_DECREF(unicode);
4933 return v;
4934}
4935
4936PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004937PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004939 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940}
4941
Walter Dörwald41980ca2007-08-16 21:55:45 +00004942/* --- UTF-32 Codec ------------------------------------------------------- */
4943
4944PyObject *
4945PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 Py_ssize_t size,
4947 const char *errors,
4948 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004949{
4950 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4951}
4952
4953PyObject *
4954PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 Py_ssize_t size,
4956 const char *errors,
4957 int *byteorder,
4958 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959{
4960 const char *starts = s;
4961 Py_ssize_t startinpos;
4962 Py_ssize_t endinpos;
4963 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004964 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004965 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004966 int bo = 0; /* assume native ordering by default */
4967 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004968 /* Offsets from q for retrieving bytes in the right order. */
4969#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4970 int iorder[] = {0, 1, 2, 3};
4971#else
4972 int iorder[] = {3, 2, 1, 0};
4973#endif
4974 PyObject *errorHandler = NULL;
4975 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004976
Walter Dörwald41980ca2007-08-16 21:55:45 +00004977 q = (unsigned char *)s;
4978 e = q + size;
4979
4980 if (byteorder)
4981 bo = *byteorder;
4982
4983 /* Check for BOM marks (U+FEFF) in the input and adjust current
4984 byte order setting accordingly. In native mode, the leading BOM
4985 mark is skipped, in all other modes, it is copied to the output
4986 stream as-is (giving a ZWNBSP character). */
4987 if (bo == 0) {
4988 if (size >= 4) {
4989 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 if (bom == 0x0000FEFF) {
4993 q += 4;
4994 bo = -1;
4995 }
4996 else if (bom == 0xFFFE0000) {
4997 q += 4;
4998 bo = 1;
4999 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 if (bom == 0x0000FEFF) {
5002 q += 4;
5003 bo = 1;
5004 }
5005 else if (bom == 0xFFFE0000) {
5006 q += 4;
5007 bo = -1;
5008 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005011 }
5012
5013 if (bo == -1) {
5014 /* force LE */
5015 iorder[0] = 0;
5016 iorder[1] = 1;
5017 iorder[2] = 2;
5018 iorder[3] = 3;
5019 }
5020 else if (bo == 1) {
5021 /* force BE */
5022 iorder[0] = 3;
5023 iorder[1] = 2;
5024 iorder[2] = 1;
5025 iorder[3] = 0;
5026 }
5027
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005028 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005029 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005030 if (!unicode)
5031 return NULL;
5032 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005033 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005034 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005035
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 Py_UCS4 ch;
5038 /* remaining bytes at the end? (size should be divisible by 4) */
5039 if (e-q<4) {
5040 if (consumed)
5041 break;
5042 errmsg = "truncated data";
5043 startinpos = ((const char *)q)-starts;
5044 endinpos = ((const char *)e)-starts;
5045 goto utf32Error;
5046 /* The remaining input chars are ignored if the callback
5047 chooses to skip the input */
5048 }
5049 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5050 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051
Benjamin Peterson29060642009-01-31 22:14:21 +00005052 if (ch >= 0x110000)
5053 {
5054 errmsg = "codepoint not in range(0x110000)";
5055 startinpos = ((const char *)q)-starts;
5056 endinpos = startinpos+4;
5057 goto utf32Error;
5058 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005059 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5060 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 q += 4;
5062 continue;
5063 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 if (unicode_decode_call_errorhandler(
5065 errors, &errorHandler,
5066 "utf32", errmsg,
5067 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005068 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005069 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 }
5071
5072 if (byteorder)
5073 *byteorder = bo;
5074
5075 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077
5078 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005079 if (unicode_resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005080 goto onError;
5081
5082 Py_XDECREF(errorHandler);
5083 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005084 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085
Benjamin Peterson29060642009-01-31 22:14:21 +00005086 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005087 Py_DECREF(unicode);
5088 Py_XDECREF(errorHandler);
5089 Py_XDECREF(exc);
5090 return NULL;
5091}
5092
5093PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005094_PyUnicode_EncodeUTF32(PyObject *str,
5095 const char *errors,
5096 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 int kind;
5099 void *data;
5100 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005101 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005102 unsigned char *p;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005103 Py_ssize_t nsize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 /* Offsets from p for storing byte pairs in the right order. */
5105#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5106 int iorder[] = {0, 1, 2, 3};
5107#else
5108 int iorder[] = {3, 2, 1, 0};
5109#endif
5110
Benjamin Peterson29060642009-01-31 22:14:21 +00005111#define STORECHAR(CH) \
5112 do { \
5113 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5114 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5115 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5116 p[iorder[0]] = (CH) & 0xff; \
5117 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 } while(0)
5119
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (!PyUnicode_Check(str)) {
5121 PyErr_BadArgument();
5122 return NULL;
5123 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005124 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005125 return NULL;
5126 kind = PyUnicode_KIND(str);
5127 data = PyUnicode_DATA(str);
5128 len = PyUnicode_GET_LENGTH(str);
5129
5130 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005131 if (nsize > PY_SSIZE_T_MAX / 4)
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005133 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005134 if (v == NULL)
5135 return NULL;
5136
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005137 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005139 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005140 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005141 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005142
5143 if (byteorder == -1) {
5144 /* force LE */
5145 iorder[0] = 0;
5146 iorder[1] = 1;
5147 iorder[2] = 2;
5148 iorder[3] = 3;
5149 }
5150 else if (byteorder == 1) {
5151 /* force BE */
5152 iorder[0] = 3;
5153 iorder[1] = 2;
5154 iorder[2] = 1;
5155 iorder[3] = 0;
5156 }
5157
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005158 for (i = 0; i < len; i++)
5159 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005160
5161 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005162 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163#undef STORECHAR
5164}
5165
Alexander Belopolsky40018472011-02-26 01:02:56 +00005166PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005167PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5168 Py_ssize_t size,
5169 const char *errors,
5170 int byteorder)
5171{
5172 PyObject *result;
5173 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5174 if (tmp == NULL)
5175 return NULL;
5176 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5177 Py_DECREF(tmp);
5178 return result;
5179}
5180
5181PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005182PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005183{
Victor Stinnerb960b342011-11-20 19:12:52 +01005184 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005185}
5186
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187/* --- UTF-16 Codec ------------------------------------------------------- */
5188
Tim Peters772747b2001-08-09 22:21:55 +00005189PyObject *
5190PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 Py_ssize_t size,
5192 const char *errors,
5193 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005194{
Walter Dörwald69652032004-09-07 20:24:22 +00005195 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5196}
5197
5198PyObject *
5199PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005200 Py_ssize_t size,
5201 const char *errors,
5202 int *byteorder,
5203 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005204{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005206 Py_ssize_t startinpos;
5207 Py_ssize_t endinpos;
5208 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005209 PyObject *unicode;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005210 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005211 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005212 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005213 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
Tim Peters772747b2001-08-09 22:21:55 +00005217 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005218 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
5220 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005221 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005223 /* Check for BOM marks (U+FEFF) in the input and adjust current
5224 byte order setting accordingly. In native mode, the leading BOM
5225 mark is skipped, in all other modes, it is copied to the output
5226 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005227 if (bo == 0 && size >= 2) {
5228 const Py_UCS4 bom = (q[1] << 8) | q[0];
5229 if (bom == 0xFEFF) {
5230 q += 2;
5231 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005233 else if (bom == 0xFFFE) {
5234 q += 2;
5235 bo = 1;
5236 }
5237 if (byteorder)
5238 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Antoine Pitrou63065d72012-05-15 23:48:04 +02005241 if (q == e) {
5242 if (consumed)
5243 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005244 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005245 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246
Antoine Pitrouab868312009-01-10 15:40:25 +00005247#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005248 native_ordering = bo <= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005249#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005250 native_ordering = bo >= 0;
Antoine Pitrouab868312009-01-10 15:40:25 +00005251#endif
Tim Peters772747b2001-08-09 22:21:55 +00005252
Antoine Pitrou63065d72012-05-15 23:48:04 +02005253 /* Note: size will always be longer than the resulting Unicode
5254 character count */
5255 unicode = PyUnicode_New((e - q + 1) / 2, 127);
5256 if (!unicode)
5257 return NULL;
5258
5259 outpos = 0;
5260 while (1) {
5261 Py_UCS4 ch = 0;
5262 if (e - q >= 2) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005263 int kind = PyUnicode_KIND(unicode);
Antoine Pitrou63065d72012-05-15 23:48:04 +02005264 if (kind == PyUnicode_1BYTE_KIND) {
5265 if (PyUnicode_IS_ASCII(unicode))
5266 ch = asciilib_utf16_decode(&q, e,
5267 PyUnicode_1BYTE_DATA(unicode), &outpos,
5268 native_ordering);
5269 else
5270 ch = ucs1lib_utf16_decode(&q, e,
5271 PyUnicode_1BYTE_DATA(unicode), &outpos,
5272 native_ordering);
5273 } else if (kind == PyUnicode_2BYTE_KIND) {
5274 ch = ucs2lib_utf16_decode(&q, e,
5275 PyUnicode_2BYTE_DATA(unicode), &outpos,
5276 native_ordering);
5277 } else {
5278 assert(kind == PyUnicode_4BYTE_KIND);
5279 ch = ucs4lib_utf16_decode(&q, e,
5280 PyUnicode_4BYTE_DATA(unicode), &outpos,
5281 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005282 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005283 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005284
Antoine Pitrou63065d72012-05-15 23:48:04 +02005285 switch (ch)
5286 {
5287 case 0:
5288 /* remaining byte at the end? (size should be even) */
5289 if (q == e || consumed)
5290 goto End;
5291 errmsg = "truncated data";
5292 startinpos = ((const char *)q) - starts;
5293 endinpos = ((const char *)e) - starts;
5294 break;
5295 /* The remaining input chars are ignored if the callback
5296 chooses to skip the input */
5297 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005298 q -= 2;
5299 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005300 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005301 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005302 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 endinpos = ((const char *)e) - starts;
5304 break;
5305 case 2:
5306 errmsg = "illegal encoding";
5307 startinpos = ((const char *)q) - 2 - starts;
5308 endinpos = startinpos + 2;
5309 break;
5310 case 3:
5311 errmsg = "illegal UTF-16 surrogate";
5312 startinpos = ((const char *)q) - 4 - starts;
5313 endinpos = startinpos + 2;
5314 break;
5315 default:
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005316 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5317 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005318 continue;
5319 }
5320
Benjamin Peterson29060642009-01-31 22:14:21 +00005321 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005322 errors,
5323 &errorHandler,
5324 "utf16", errmsg,
5325 &starts,
5326 (const char **)&e,
5327 &startinpos,
5328 &endinpos,
5329 &exc,
5330 (const char **)&q,
5331 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005332 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005333 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 }
5335
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336End:
Walter Dörwald69652032004-09-07 20:24:22 +00005337 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005338 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005339
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 /* Adjust length */
Victor Stinner16e6a802011-12-12 13:24:15 +01005341 if (unicode_resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342 goto onError;
5343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344 Py_XDECREF(errorHandler);
5345 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005346 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
Benjamin Peterson29060642009-01-31 22:14:21 +00005348 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005350 Py_XDECREF(errorHandler);
5351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 return NULL;
5353}
5354
Tim Peters772747b2001-08-09 22:21:55 +00005355PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005356_PyUnicode_EncodeUTF16(PyObject *str,
5357 const char *errors,
5358 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005360 enum PyUnicode_Kind kind;
5361 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005362 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005363 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005364 unsigned short *out;
5365 Py_ssize_t bytesize;
5366 Py_ssize_t pairs;
5367#ifdef WORDS_BIGENDIAN
5368 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005369#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005370 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005371#endif
5372
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005373 if (!PyUnicode_Check(str)) {
5374 PyErr_BadArgument();
5375 return NULL;
5376 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005377 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005378 return NULL;
5379 kind = PyUnicode_KIND(str);
5380 data = PyUnicode_DATA(str);
5381 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005382
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005383 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005384 if (kind == PyUnicode_4BYTE_KIND) {
5385 const Py_UCS4 *in = (const Py_UCS4 *)data;
5386 const Py_UCS4 *end = in + len;
5387 while (in < end)
5388 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005389 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005390 }
5391 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 return PyErr_NoMemory();
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005393 bytesize = (len + pairs + (byteorder == 0)) * 2;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005394 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (v == NULL)
5396 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005399 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005400 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005402 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005403 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005404 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005405
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005406 switch (kind) {
5407 case PyUnicode_1BYTE_KIND: {
5408 ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5409 break;
Tim Peters772747b2001-08-09 22:21:55 +00005410 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005411 case PyUnicode_2BYTE_KIND: {
5412 ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5413 break;
Tim Peters772747b2001-08-09 22:21:55 +00005414 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005415 case PyUnicode_4BYTE_KIND: {
5416 ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5417 break;
5418 }
5419 default:
5420 assert(0);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005421 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005422
5423 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005424 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425}
5426
Alexander Belopolsky40018472011-02-26 01:02:56 +00005427PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005428PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5429 Py_ssize_t size,
5430 const char *errors,
5431 int byteorder)
5432{
5433 PyObject *result;
5434 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5435 if (tmp == NULL)
5436 return NULL;
5437 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5438 Py_DECREF(tmp);
5439 return result;
5440}
5441
5442PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005443PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005445 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446}
5447
5448/* --- Unicode Escape Codec ----------------------------------------------- */
5449
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string and, if so, how many
   characters it decodes to.

   Returns the number of characters needed to hold the decoded string, or
   -1 when that cannot be determined up front, namely when:
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
   An unrecognised escape is counted as two characters (the backslash and
   the character after it); backslash-newline counts as zero.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size; computing a pointer
       in front of the buffer is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5504
Fredrik Lundh06d12682001-01-24 07:59:11 +00005505static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005506
Alexander Belopolsky40018472011-02-26 01:02:56 +00005507PyObject *
5508PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005509 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005510 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005512 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005513 Py_ssize_t startinpos;
5514 Py_ssize_t endinpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005515 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005517 char* message;
5518 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005519 PyObject *errorHandler = NULL;
5520 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005521 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005522 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005523
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005524 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525
5526 /* After length_of_escaped_ascii_string() there are two alternatives,
5527 either the string is pure ASCII with named escapes like \n, etc.
5528 and we determined it's exact size (common case)
5529 or it contains \x, \u, ... escape sequences. then we create a
5530 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005531 if (len >= 0) {
5532 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005533 if (!v)
5534 goto onError;
5535 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005536 }
5537 else {
5538 /* Escaped strings will always be longer than the resulting
5539 Unicode string, so we start with size here and then reduce the
5540 length after conversion to the true value.
5541 (but if the error callback returns a long replacement string
5542 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005543 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005544 if (!v)
5545 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005546 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005547 }
5548
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005550 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005551 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005553
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554 while (s < end) {
5555 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005556 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005557 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005559 /* The only case in which i == ascii_length is a backslash
5560 followed by a newline. */
5561 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563 /* Non-escape characters are interpreted as Unicode ordinals */
5564 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005565 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5566 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005567 continue;
5568 }
5569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005570 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 /* \ - Escapes */
5572 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005573 c = *s++;
5574 if (s > end)
5575 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005577 /* The only case in which i == ascii_length is a backslash
5578 followed by a newline. */
5579 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005580
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005581 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Benjamin Peterson29060642009-01-31 22:14:21 +00005583 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005584#define WRITECHAR(ch) \
5585 do { \
5586 if (unicode_putchar(&v, &i, ch) < 0) \
5587 goto onError; \
5588 }while(0)
5589
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005591 case '\\': WRITECHAR('\\'); break;
5592 case '\'': WRITECHAR('\''); break;
5593 case '\"': WRITECHAR('\"'); break;
5594 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005596 case 'f': WRITECHAR('\014'); break;
5597 case 't': WRITECHAR('\t'); break;
5598 case 'n': WRITECHAR('\n'); break;
5599 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005600 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005601 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005602 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005603 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
Benjamin Peterson29060642009-01-31 22:14:21 +00005605 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606 case '0': case '1': case '2': case '3':
5607 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005608 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005609 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005610 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005611 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005612 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005613 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005614 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 break;
5616
Benjamin Peterson29060642009-01-31 22:14:21 +00005617 /* hex escapes */
5618 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005620 digits = 2;
5621 message = "truncated \\xXX escape";
5622 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623
Benjamin Peterson29060642009-01-31 22:14:21 +00005624 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005625 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005626 digits = 4;
5627 message = "truncated \\uXXXX escape";
5628 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629
Benjamin Peterson29060642009-01-31 22:14:21 +00005630 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005631 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005632 digits = 8;
5633 message = "truncated \\UXXXXXXXX escape";
5634 hexescape:
5635 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005636 if (end - s < digits) {
5637 /* count only hex digits */
5638 for (; s < end; ++s) {
5639 c = (unsigned char)*s;
5640 if (!Py_ISXDIGIT(c))
5641 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005642 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005643 goto error;
5644 }
5645 for (; digits--; ++s) {
5646 c = (unsigned char)*s;
5647 if (!Py_ISXDIGIT(c))
5648 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005649 chr = (chr<<4) & ~0xF;
5650 if (c >= '0' && c <= '9')
5651 chr += c - '0';
5652 else if (c >= 'a' && c <= 'f')
5653 chr += 10 + c - 'a';
5654 else
5655 chr += 10 + c - 'A';
5656 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005657 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005658 /* _decoding_error will have already written into the
5659 target buffer. */
5660 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005661 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005662 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005663 message = "illegal Unicode character";
5664 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005665 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005666 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005667 break;
5668
Benjamin Peterson29060642009-01-31 22:14:21 +00005669 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 case 'N':
5671 message = "malformed \\N character escape";
5672 if (ucnhash_CAPI == NULL) {
5673 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5675 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005676 if (ucnhash_CAPI == NULL)
5677 goto ucnhashError;
5678 }
5679 if (*s == '{') {
5680 const char *start = s+1;
5681 /* look for the closing brace */
5682 while (*s != '}' && s < end)
5683 s++;
5684 if (s > start && s < end && *s == '}') {
5685 /* found a name. look it up in the unicode database */
5686 message = "unknown Unicode character name";
5687 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005688 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005689 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005690 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 goto store;
5692 }
5693 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005694 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005695
5696 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005697 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005698 message = "\\ at end of string";
5699 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005700 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005701 }
5702 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005703 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005704 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005705 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005706 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005708 continue;
5709
5710 error:
5711 endinpos = s-starts;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005712 if (unicode_decode_call_errorhandler(
5713 errors, &errorHandler,
5714 "unicodeescape", message,
5715 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005716 &v, &i))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005717 goto onError;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005718 len = PyUnicode_GET_LENGTH(v);
Serhiy Storchakad6793772013-01-29 10:20:44 +02005719 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005721#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722
Victor Stinner16e6a802011-12-12 13:24:15 +01005723 if (unicode_resize(&v, i) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005724 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005725 Py_XDECREF(errorHandler);
5726 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005727 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005730 PyErr_SetString(
5731 PyExc_UnicodeError,
5732 "\\N escapes not supported (can't load unicodedata module)"
5733 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005734 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 Py_XDECREF(errorHandler);
5736 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005737 return NULL;
5738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005741 Py_XDECREF(errorHandler);
5742 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 return NULL;
5744}
5745
5746/* Return a Unicode-Escape string version of the Unicode object.
5747
5748 If quotes is true, the string is enclosed in u"" or u'' quotes as
5749 appropriate.
5750
5751*/
5752
Alexander Belopolsky40018472011-02-26 01:02:56 +00005753PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005754PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005756 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005757 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005759 int kind;
5760 void *data;
5761 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762
Ezio Melottie7f90372012-10-05 03:33:31 +03005763 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005764 escape.
5765
Ezio Melottie7f90372012-10-05 03:33:31 +03005766 For UCS1 strings it's '\xxx', 4 bytes per source character.
5767 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5768 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005769 */
5770
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005771 if (!PyUnicode_Check(unicode)) {
5772 PyErr_BadArgument();
5773 return NULL;
5774 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005775 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005776 return NULL;
5777 len = PyUnicode_GET_LENGTH(unicode);
5778 kind = PyUnicode_KIND(unicode);
5779 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005780 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005781 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5782 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5783 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5784 }
5785
5786 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005787 return PyBytes_FromStringAndSize(NULL, 0);
5788
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005789 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005791
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005792 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005793 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005794 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796 if (repr == NULL)
5797 return NULL;
5798
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005799 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005800
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005801 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005802 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005803
Walter Dörwald79e913e2007-05-12 11:08:06 +00005804 /* Escape backslashes */
5805 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005806 *p++ = '\\';
5807 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005808 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005810
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005811 /* Map 21-bit characters to '\U00xxxxxx' */
5812 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005813 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005814 *p++ = '\\';
5815 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005816 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5817 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5818 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5819 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5820 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5821 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5822 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5823 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005825 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005826
Guido van Rossumd57fd912000-03-10 22:53:23 +00005827 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005828 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005829 *p++ = '\\';
5830 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005831 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5832 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5833 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5834 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005835 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005836
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005837 /* Map special whitespace to '\t', \n', '\r' */
5838 else if (ch == '\t') {
5839 *p++ = '\\';
5840 *p++ = 't';
5841 }
5842 else if (ch == '\n') {
5843 *p++ = '\\';
5844 *p++ = 'n';
5845 }
5846 else if (ch == '\r') {
5847 *p++ = '\\';
5848 *p++ = 'r';
5849 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005850
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005851 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005852 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005854 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005855 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5856 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005857 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005858
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 /* Copy everything else as-is */
5860 else
5861 *p++ = (char) ch;
5862 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005864 assert(p - PyBytes_AS_STRING(repr) > 0);
5865 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5866 return NULL;
5867 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868}
5869
Alexander Belopolsky40018472011-02-26 01:02:56 +00005870PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5872 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005874 PyObject *result;
5875 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5876 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 result = PyUnicode_AsUnicodeEscapeString(tmp);
5879 Py_DECREF(tmp);
5880 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881}
5882
5883/* --- Raw Unicode Escape Codec ------------------------------------------- */
5884
Alexander Belopolsky40018472011-02-26 01:02:56 +00005885PyObject *
5886PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005887 Py_ssize_t size,
5888 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005889{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005890 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005891 Py_ssize_t startinpos;
5892 Py_ssize_t endinpos;
5893 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005894 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 const char *end;
5896 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 PyObject *errorHandler = NULL;
5898 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005899
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900 /* Escaped strings will always be longer than the resulting
5901 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005902 length after conversion to the true value. (But decoding error
5903 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005904 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005906 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005907 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005908 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005909 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 end = s + size;
5911 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005912 unsigned char c;
5913 Py_UCS4 x;
5914 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005915 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 /* Non-escape characters are interpreted as Unicode ordinals */
5918 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005919 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5920 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005921 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005922 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 startinpos = s-starts;
5924
5925 /* \u-escapes are only interpreted iff the number of leading
5926 backslashes if odd */
5927 bs = s;
5928 for (;s < end;) {
5929 if (*s != '\\')
5930 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005931 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
5932 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005933 }
5934 if (((s - bs) & 1) == 0 ||
5935 s >= end ||
5936 (*s != 'u' && *s != 'U')) {
5937 continue;
5938 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005939 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00005940 count = *s=='u' ? 4 : 8;
5941 s++;
5942
5943 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00005944 for (x = 0, i = 0; i < count; ++i, ++s) {
5945 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005946 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005947 endinpos = s-starts;
5948 if (unicode_decode_call_errorhandler(
5949 errors, &errorHandler,
5950 "rawunicodeescape", "truncated \\uXXXX",
5951 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005952 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005953 goto onError;
5954 goto nextByte;
5955 }
5956 x = (x<<4) & ~0xF;
5957 if (c >= '0' && c <= '9')
5958 x += c - '0';
5959 else if (c >= 'a' && c <= 'f')
5960 x += 10 + c - 'a';
5961 else
5962 x += 10 + c - 'A';
5963 }
Victor Stinner8faf8212011-12-08 22:14:11 +01005964 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005965 if (unicode_putchar(&v, &outpos, x) < 0)
5966 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005967 } else {
5968 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005969 if (unicode_decode_call_errorhandler(
5970 errors, &errorHandler,
5971 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005972 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005973 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005975 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005976 nextByte:
5977 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 }
Victor Stinner16e6a802011-12-12 13:24:15 +01005979 if (unicode_resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005981 Py_XDECREF(errorHandler);
5982 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005983 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00005984
Benjamin Peterson29060642009-01-31 22:14:21 +00005985 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005987 Py_XDECREF(errorHandler);
5988 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 return NULL;
5990}
5991
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005992
Alexander Belopolsky40018472011-02-26 01:02:56 +00005993PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005994PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005996 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 char *p;
5998 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999 Py_ssize_t expandsize, pos;
6000 int kind;
6001 void *data;
6002 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006004 if (!PyUnicode_Check(unicode)) {
6005 PyErr_BadArgument();
6006 return NULL;
6007 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006008 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 return NULL;
6010 kind = PyUnicode_KIND(unicode);
6011 data = PyUnicode_DATA(unicode);
6012 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006013 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6014 bytes, and 1 byte characters 4. */
6015 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006016
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006019
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006020 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 if (repr == NULL)
6022 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006023 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006024 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006026 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027 for (pos = 0; pos < len; pos++) {
6028 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 /* Map 32-bit characters to '\Uxxxxxxxx' */
6030 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006031 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006032 *p++ = '\\';
6033 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006034 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6035 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6036 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6037 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6038 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6039 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6040 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6041 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006042 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 *p++ = '\\';
6046 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006047 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6048 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6049 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6050 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 /* Copy everything else as-is */
6053 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054 *p++ = (char) ch;
6055 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006056
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006057 assert(p > q);
6058 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006059 return NULL;
6060 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006061}
6062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006064PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6065 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006067 PyObject *result;
6068 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6069 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006070 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006071 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6072 Py_DECREF(tmp);
6073 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074}
6075
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006076/* --- Unicode Internal Codec ------------------------------------------- */
6077
Alexander Belopolsky40018472011-02-26 01:02:56 +00006078PyObject *
6079_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006080 Py_ssize_t size,
6081 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006082{
6083 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006084 Py_ssize_t startinpos;
6085 Py_ssize_t endinpos;
6086 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006087 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006088 const char *end;
6089 const char *reason;
6090 PyObject *errorHandler = NULL;
6091 PyObject *exc = NULL;
6092
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006093 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006094 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006095 1))
6096 return NULL;
6097
Thomas Wouters89f507f2006-12-13 04:49:30 +00006098 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006099 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006100 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006102 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006103 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006104 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006105 end = s + size;
6106
6107 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006108 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006109 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006110 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006111 endinpos = end-starts;
6112 reason = "truncated input";
6113 goto error;
6114 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006115 /* We copy the raw representation one byte at a time because the
6116 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006117 ((char *) &uch)[0] = s[0];
6118 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006119#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006120 ((char *) &uch)[2] = s[2];
6121 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006122#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006123 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006124#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006125 /* We have to sanity check the raw data, otherwise doom looms for
6126 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006127 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006128 endinpos = s - starts + Py_UNICODE_SIZE;
6129 reason = "illegal code point (> 0x10FFFF)";
6130 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006131 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006132#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006133 s += Py_UNICODE_SIZE;
6134#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006135 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006136 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006137 Py_UNICODE uch2;
6138 ((char *) &uch2)[0] = s[0];
6139 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006140 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006141 {
Victor Stinner551ac952011-11-29 22:58:13 +01006142 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006143 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006144 }
6145 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006146#endif
6147
6148 if (unicode_putchar(&v, &outpos, ch) < 0)
6149 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006150 continue;
6151
6152 error:
6153 startinpos = s - starts;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006154 if (unicode_decode_call_errorhandler(
6155 errors, &errorHandler,
6156 "unicode_internal", reason,
6157 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006158 &v, &outpos))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006159 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006160 }
6161
Victor Stinner16e6a802011-12-12 13:24:15 +01006162 if (unicode_resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006163 goto onError;
6164 Py_XDECREF(errorHandler);
6165 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006166 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006167
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006169 Py_XDECREF(v);
6170 Py_XDECREF(errorHandler);
6171 Py_XDECREF(exc);
6172 return NULL;
6173}
6174
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175/* --- Latin-1 Codec ------------------------------------------------------ */
6176
Alexander Belopolsky40018472011-02-26 01:02:56 +00006177PyObject *
6178PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006179 Py_ssize_t size,
6180 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006182 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006183 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184}
6185
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006186/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006187static void
6188make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006189 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006190 PyObject *unicode,
6191 Py_ssize_t startpos, Py_ssize_t endpos,
6192 const char *reason)
6193{
6194 if (*exceptionObject == NULL) {
6195 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006196 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006197 encoding, unicode, startpos, endpos, reason);
6198 }
6199 else {
6200 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6201 goto onError;
6202 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6203 goto onError;
6204 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6205 goto onError;
6206 return;
6207 onError:
6208 Py_DECREF(*exceptionObject);
6209 *exceptionObject = NULL;
6210 }
6211}
6212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006214static void
6215raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006216 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006217 PyObject *unicode,
6218 Py_ssize_t startpos, Py_ssize_t endpos,
6219 const char *reason)
6220{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006221 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006222 encoding, unicode, startpos, endpos, reason);
6223 if (*exceptionObject != NULL)
6224 PyCodec_StrictErrors(*exceptionObject);
6225}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226
6227/* error handling callback helper:
6228 build arguments, call the callback and check the arguments,
6229 put the result into newpos and return the replacement string, which
6230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006231static PyObject *
6232unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006233 PyObject **errorHandler,
6234 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006235 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006236 Py_ssize_t startpos, Py_ssize_t endpos,
6237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006238{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006239 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006240 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006241 PyObject *restuple;
6242 PyObject *resunicode;
6243
6244 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006246 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006247 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006248 }
6249
Benjamin Petersonbac79492012-01-14 13:34:47 -05006250 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006251 return NULL;
6252 len = PyUnicode_GET_LENGTH(unicode);
6253
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006254 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006255 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006256 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006258
6259 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006260 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006263 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006264 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 Py_DECREF(restuple);
6266 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006267 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006268 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 &resunicode, newpos)) {
6270 Py_DECREF(restuple);
6271 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006272 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006273 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6274 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6275 Py_DECREF(restuple);
6276 return NULL;
6277 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006278 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006279 *newpos = len + *newpos;
6280 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6282 Py_DECREF(restuple);
6283 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006284 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006285 Py_INCREF(resunicode);
6286 Py_DECREF(restuple);
6287 return resunicode;
6288}
6289
Alexander Belopolsky40018472011-02-26 01:02:56 +00006290static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006291unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006292 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006293 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006294{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006295 /* input state */
6296 Py_ssize_t pos=0, size;
6297 int kind;
6298 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006299 /* output object */
6300 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006301 /* pointer into the output */
6302 char *str;
6303 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006304 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006305 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6306 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006307 PyObject *errorHandler = NULL;
6308 PyObject *exc = NULL;
6309 /* the following variable is used for caching string comparisons
6310 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6311 int known_errorHandler = -1;
6312
Benjamin Petersonbac79492012-01-14 13:34:47 -05006313 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006314 return NULL;
6315 size = PyUnicode_GET_LENGTH(unicode);
6316 kind = PyUnicode_KIND(unicode);
6317 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006318 /* allocate enough for a simple encoding without
6319 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006320 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006321 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006322 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006323 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006324 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006325 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326 ressize = size;
6327
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006328 while (pos < size) {
6329 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330
Benjamin Peterson29060642009-01-31 22:14:21 +00006331 /* can we encode this? */
6332 if (c<limit) {
6333 /* no overflow check, because we know that the space is enough */
6334 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006337 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006338 Py_ssize_t requiredsize;
6339 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006341 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006342 Py_ssize_t collstart = pos;
6343 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006344 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006345 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006346 ++collend;
6347 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6348 if (known_errorHandler==-1) {
6349 if ((errors==NULL) || (!strcmp(errors, "strict")))
6350 known_errorHandler = 1;
6351 else if (!strcmp(errors, "replace"))
6352 known_errorHandler = 2;
6353 else if (!strcmp(errors, "ignore"))
6354 known_errorHandler = 3;
6355 else if (!strcmp(errors, "xmlcharrefreplace"))
6356 known_errorHandler = 4;
6357 else
6358 known_errorHandler = 0;
6359 }
6360 switch (known_errorHandler) {
6361 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006362 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 goto onError;
6364 case 2: /* replace */
6365 while (collstart++<collend)
6366 *str++ = '?'; /* fall through */
6367 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006368 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 break;
6370 case 4: /* xmlcharrefreplace */
6371 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 /* determine replacement size */
6373 for (i = collstart, repsize = 0; i < collend; ++i) {
6374 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6375 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006378 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006380 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006381 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006385 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006387 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006388 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006393 if (requiredsize > ressize) {
6394 if (requiredsize<2*ressize)
6395 requiredsize = 2*ressize;
6396 if (_PyBytes_Resize(&res, requiredsize))
6397 goto onError;
6398 str = PyBytes_AS_STRING(res) + respos;
6399 ressize = requiredsize;
6400 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006401 /* generate replacement */
6402 for (i = collstart; i < collend; ++i) {
6403 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006404 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006405 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 break;
6407 default:
6408 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006409 encoding, reason, unicode, &exc,
6410 collstart, collend, &newpos);
6411 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006412 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006413 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006414 if (PyBytes_Check(repunicode)) {
6415 /* Directly copy bytes result to output. */
6416 repsize = PyBytes_Size(repunicode);
6417 if (repsize > 1) {
6418 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006419 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006420 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6421 Py_DECREF(repunicode);
6422 goto onError;
6423 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006424 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006425 ressize += repsize-1;
6426 }
6427 memcpy(str, PyBytes_AsString(repunicode), repsize);
6428 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006429 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006430 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006431 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006432 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 /* need more space? (at least enough for what we
6434 have+the replacement+the rest of the string, so
6435 we won't have to check space for encodable characters) */
6436 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006437 repsize = PyUnicode_GET_LENGTH(repunicode);
6438 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006439 if (requiredsize > ressize) {
6440 if (requiredsize<2*ressize)
6441 requiredsize = 2*ressize;
6442 if (_PyBytes_Resize(&res, requiredsize)) {
6443 Py_DECREF(repunicode);
6444 goto onError;
6445 }
6446 str = PyBytes_AS_STRING(res) + respos;
6447 ressize = requiredsize;
6448 }
6449 /* check if there is anything unencodable in the replacement
6450 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 for (i = 0; repsize-->0; ++i, ++str) {
6452 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006454 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006455 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006456 Py_DECREF(repunicode);
6457 goto onError;
6458 }
6459 *str = (char)c;
6460 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006461 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006462 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006464 }
6465 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006466 /* Resize if we allocated to much */
6467 size = str - PyBytes_AS_STRING(res);
6468 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006469 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006470 if (_PyBytes_Resize(&res, size) < 0)
6471 goto onError;
6472 }
6473
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474 Py_XDECREF(errorHandler);
6475 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006476 return res;
6477
6478 onError:
6479 Py_XDECREF(res);
6480 Py_XDECREF(errorHandler);
6481 Py_XDECREF(exc);
6482 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483}
6484
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006486PyObject *
6487PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006488 Py_ssize_t size,
6489 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 PyObject *result;
6492 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6493 if (unicode == NULL)
6494 return NULL;
6495 result = unicode_encode_ucs1(unicode, errors, 256);
6496 Py_DECREF(unicode);
6497 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498}
6499
Alexander Belopolsky40018472011-02-26 01:02:56 +00006500PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006501_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502{
6503 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 PyErr_BadArgument();
6505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006507 if (PyUnicode_READY(unicode) == -1)
6508 return NULL;
6509 /* Fast path: if it is a one-byte string, construct
6510 bytes object directly. */
6511 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6512 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6513 PyUnicode_GET_LENGTH(unicode));
6514 /* Non-Latin-1 characters present. Defer to above function to
6515 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006517}
6518
6519PyObject*
6520PyUnicode_AsLatin1String(PyObject *unicode)
6521{
6522 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523}
6524
/* --- 7-bit ASCII Codec -------------------------------------------------- */
6526
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527PyObject *
6528PyUnicode_DecodeASCII(const char *s,
6529 Py_ssize_t size,
6530 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006533 PyObject *unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006534 int kind;
6535 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006536 Py_ssize_t startinpos;
6537 Py_ssize_t endinpos;
6538 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006539 const char *e;
6540 PyObject *errorHandler = NULL;
6541 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006542
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006544 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006545
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006547 if (size == 1 && (unsigned char)s[0] < 128)
6548 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006549
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006550 unicode = PyUnicode_New(size, 127);
6551 if (unicode == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006553
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006554 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006555 data = PyUnicode_1BYTE_DATA(unicode);
6556 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6557 if (outpos == size)
6558 return unicode;
6559
6560 s += outpos;
6561 kind = PyUnicode_1BYTE_KIND;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 register unsigned char c = (unsigned char)*s;
6564 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006565 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 ++s;
6567 }
6568 else {
6569 startinpos = s-starts;
6570 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 if (unicode_decode_call_errorhandler(
6572 errors, &errorHandler,
6573 "ascii", "ordinal not in range(128)",
6574 &starts, &e, &startinpos, &endinpos, &exc, &s,
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006575 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 goto onError;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006577 kind = PyUnicode_KIND(unicode);
6578 data = PyUnicode_DATA(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00006579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006581 if (unicode_resize(&unicode, outpos) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006582 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006583 Py_XDECREF(errorHandler);
6584 Py_XDECREF(exc);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006585 assert(_PyUnicode_CheckConsistency(unicode, 1));
6586 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00006587
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 onError:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006589 Py_XDECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 Py_XDECREF(errorHandler);
6591 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return NULL;
6593}
6594
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006595/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006596PyObject *
6597PyUnicode_EncodeASCII(const Py_UNICODE *p,
6598 Py_ssize_t size,
6599 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006601 PyObject *result;
6602 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6603 if (unicode == NULL)
6604 return NULL;
6605 result = unicode_encode_ucs1(unicode, errors, 128);
6606 Py_DECREF(unicode);
6607 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608}
6609
Alexander Belopolsky40018472011-02-26 01:02:56 +00006610PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006611_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612{
6613 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006614 PyErr_BadArgument();
6615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006617 if (PyUnicode_READY(unicode) == -1)
6618 return NULL;
6619 /* Fast path: if it is an ASCII-only string, construct bytes object
6620 directly. Else defer to above function to raise the exception. */
6621 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6622 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6623 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006624 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006625}
6626
6627PyObject *
6628PyUnicode_AsASCIIString(PyObject *unicode)
6629{
6630 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631}
6632
Victor Stinner99b95382011-07-04 14:23:54 +02006633#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006634
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006636
/* The code-page helpers below take their lengths as int.  When size_t is
   wider than int, a Py_ssize_t-sized input may not fit in one call and has
   to be processed in several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Supply the flag value when the platform headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6644
6645static char*
6646code_page_name(UINT code_page, PyObject **obj)
6647{
6648 *obj = NULL;
6649 if (code_page == CP_ACP)
6650 return "mbcs";
6651 if (code_page == CP_UTF7)
6652 return "CP_UTF7";
6653 if (code_page == CP_UTF8)
6654 return "CP_UTF8";
6655
6656 *obj = PyBytes_FromFormat("cp%u", code_page);
6657 if (*obj == NULL)
6658 return NULL;
6659 return PyBytes_AS_STRING(*obj);
6660}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006663is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006664{
6665 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006666 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006667
Victor Stinner3a50e702011-10-18 21:21:00 +02006668 if (!IsDBCSLeadByteEx(code_page, *curr))
6669 return 0;
6670
6671 prev = CharPrevExA(code_page, s, curr, 0);
6672 if (prev == curr)
6673 return 1;
6674 /* FIXME: This code is limited to "true" double-byte encodings,
6675 as it assumes an incomplete character consists of a single
6676 byte. */
6677 if (curr - prev == 2)
6678 return 1;
6679 if (!IsDBCSLeadByteEx(code_page, *prev))
6680 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006681 return 0;
6682}
6683
Victor Stinner3a50e702011-10-18 21:21:00 +02006684static DWORD
6685decode_code_page_flags(UINT code_page)
6686{
6687 if (code_page == CP_UTF7) {
6688 /* The CP_UTF7 decoder only supports flags=0 */
6689 return 0;
6690 }
6691 else
6692 return MB_ERR_INVALID_CHARS;
6693}
6694
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006695/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006696 * Decode a byte string from a Windows code page into unicode object in strict
6697 * mode.
6698 *
6699 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6700 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006701 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006702static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006703decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006704 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006705 const char *in,
6706 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707{
Victor Stinner3a50e702011-10-18 21:21:00 +02006708 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006709 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006710 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006711
6712 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006713 assert(insize > 0);
6714 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6715 if (outsize <= 0)
6716 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006717
6718 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006720 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006721 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006722 if (*v == NULL)
6723 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006724 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006725 }
6726 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006728 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006729 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006731 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006732 }
6733
6734 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006735 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6736 if (outsize <= 0)
6737 goto error;
6738 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006739
Victor Stinner3a50e702011-10-18 21:21:00 +02006740error:
6741 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6742 return -2;
6743 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006744 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006745}
6746
Victor Stinner3a50e702011-10-18 21:21:00 +02006747/*
6748 * Decode a byte string from a code page into unicode object with an error
6749 * handler.
6750 *
6751 * Returns consumed size if succeed, or raise a WindowsError or
6752 * UnicodeDecodeError exception and returns -1 on error.
6753 */
6754static int
6755decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006756 PyObject **v,
6757 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006758 const char *errors)
6759{
6760 const char *startin = in;
6761 const char *endin = in + size;
6762 const DWORD flags = decode_code_page_flags(code_page);
6763 /* Ideally, we should get reason from FormatMessage. This is the Windows
6764 2000 English version of the message. */
6765 const char *reason = "No mapping for the Unicode character exists "
6766 "in the target code page.";
6767 /* each step cannot decode more than 1 character, but a character can be
6768 represented as a surrogate pair */
6769 wchar_t buffer[2], *startout, *out;
6770 int insize, outsize;
6771 PyObject *errorHandler = NULL;
6772 PyObject *exc = NULL;
6773 PyObject *encoding_obj = NULL;
6774 char *encoding;
6775 DWORD err;
6776 int ret = -1;
6777
6778 assert(size > 0);
6779
6780 encoding = code_page_name(code_page, &encoding_obj);
6781 if (encoding == NULL)
6782 return -1;
6783
6784 if (errors == NULL || strcmp(errors, "strict") == 0) {
6785 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6786 UnicodeDecodeError. */
6787 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6788 if (exc != NULL) {
6789 PyCodec_StrictErrors(exc);
6790 Py_CLEAR(exc);
6791 }
6792 goto error;
6793 }
6794
6795 if (*v == NULL) {
6796 /* Create unicode object */
6797 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6798 PyErr_NoMemory();
6799 goto error;
6800 }
Victor Stinnerab595942011-12-17 04:59:06 +01006801 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006802 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006803 if (*v == NULL)
6804 goto error;
6805 startout = PyUnicode_AS_UNICODE(*v);
6806 }
6807 else {
6808 /* Extend unicode object */
6809 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6810 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6811 PyErr_NoMemory();
6812 goto error;
6813 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006814 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006815 goto error;
6816 startout = PyUnicode_AS_UNICODE(*v) + n;
6817 }
6818
6819 /* Decode the byte string character per character */
6820 out = startout;
6821 while (in < endin)
6822 {
6823 /* Decode a character */
6824 insize = 1;
6825 do
6826 {
6827 outsize = MultiByteToWideChar(code_page, flags,
6828 in, insize,
6829 buffer, Py_ARRAY_LENGTH(buffer));
6830 if (outsize > 0)
6831 break;
6832 err = GetLastError();
6833 if (err != ERROR_NO_UNICODE_TRANSLATION
6834 && err != ERROR_INSUFFICIENT_BUFFER)
6835 {
6836 PyErr_SetFromWindowsErr(0);
6837 goto error;
6838 }
6839 insize++;
6840 }
6841 /* 4=maximum length of a UTF-8 sequence */
6842 while (insize <= 4 && (in + insize) <= endin);
6843
6844 if (outsize <= 0) {
6845 Py_ssize_t startinpos, endinpos, outpos;
6846
6847 startinpos = in - startin;
6848 endinpos = startinpos + 1;
6849 outpos = out - PyUnicode_AS_UNICODE(*v);
6850 if (unicode_decode_call_errorhandler(
6851 errors, &errorHandler,
6852 encoding, reason,
6853 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006854 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 {
6856 goto error;
6857 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006858 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006859 }
6860 else {
6861 in += insize;
6862 memcpy(out, buffer, outsize * sizeof(wchar_t));
6863 out += outsize;
6864 }
6865 }
6866
6867 /* write a NUL character at the end */
6868 *out = 0;
6869
6870 /* Extend unicode object */
6871 outsize = out - startout;
6872 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006873 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01006875 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02006876
6877error:
6878 Py_XDECREF(encoding_obj);
6879 Py_XDECREF(errorHandler);
6880 Py_XDECREF(exc);
6881 return ret;
6882}
6883
Victor Stinner3a50e702011-10-18 21:21:00 +02006884static PyObject *
6885decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006886 const char *s, Py_ssize_t size,
6887 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888{
Victor Stinner76a31a62011-11-04 00:05:13 +01006889 PyObject *v = NULL;
6890 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006891
Victor Stinner3a50e702011-10-18 21:21:00 +02006892 if (code_page < 0) {
6893 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6894 return NULL;
6895 }
6896
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006897 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006899
Victor Stinner76a31a62011-11-04 00:05:13 +01006900 do
6901 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01006903 if (size > INT_MAX) {
6904 chunk_size = INT_MAX;
6905 final = 0;
6906 done = 0;
6907 }
6908 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01006910 {
6911 chunk_size = (int)size;
6912 final = (consumed == NULL);
6913 done = 1;
6914 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915
Victor Stinner76a31a62011-11-04 00:05:13 +01006916 /* Skip trailing lead-byte unless 'final' is set */
6917 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6918 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006919
Victor Stinner76a31a62011-11-04 00:05:13 +01006920 if (chunk_size == 0 && done) {
6921 if (v != NULL)
6922 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006923 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01006924 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925
Victor Stinner76a31a62011-11-04 00:05:13 +01006926
6927 converted = decode_code_page_strict(code_page, &v,
6928 s, chunk_size);
6929 if (converted == -2)
6930 converted = decode_code_page_errors(code_page, &v,
6931 s, chunk_size,
6932 errors);
6933 assert(converted != 0);
6934
6935 if (converted < 0) {
6936 Py_XDECREF(v);
6937 return NULL;
6938 }
6939
6940 if (consumed)
6941 *consumed += converted;
6942
6943 s += converted;
6944 size -= converted;
6945 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02006946
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006947 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948}
6949
Alexander Belopolsky40018472011-02-26 01:02:56 +00006950PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02006951PyUnicode_DecodeCodePageStateful(int code_page,
6952 const char *s,
6953 Py_ssize_t size,
6954 const char *errors,
6955 Py_ssize_t *consumed)
6956{
6957 return decode_code_page_stateful(code_page, s, size, errors, consumed);
6958}
6959
6960PyObject *
6961PyUnicode_DecodeMBCSStateful(const char *s,
6962 Py_ssize_t size,
6963 const char *errors,
6964 Py_ssize_t *consumed)
6965{
6966 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6967}
6968
6969PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006970PyUnicode_DecodeMBCS(const char *s,
6971 Py_ssize_t size,
6972 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006973{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6975}
6976
Victor Stinner3a50e702011-10-18 21:21:00 +02006977static DWORD
6978encode_code_page_flags(UINT code_page, const char *errors)
6979{
6980 if (code_page == CP_UTF8) {
6981 if (winver.dwMajorVersion >= 6)
6982 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6983 and later */
6984 return WC_ERR_INVALID_CHARS;
6985 else
6986 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6987 return 0;
6988 }
6989 else if (code_page == CP_UTF7) {
6990 /* CP_UTF7 only supports flags=0 */
6991 return 0;
6992 }
6993 else {
6994 if (errors != NULL && strcmp(errors, "replace") == 0)
6995 return 0;
6996 else
6997 return WC_NO_BEST_FIT_CHARS;
6998 }
6999}
7000
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007002 * Encode a Unicode string to a Windows code page into a byte string in strict
7003 * mode.
7004 *
7005 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7006 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007008static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007009encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007010 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007011 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007012{
Victor Stinner554f3f02010-06-16 23:33:54 +00007013 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007014 BOOL *pusedDefaultChar = &usedDefaultChar;
7015 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007016 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007017 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007018 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007019 const DWORD flags = encode_code_page_flags(code_page, NULL);
7020 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007021 /* Create a substring so that we can get the UTF-16 representation
7022 of just the slice under consideration. */
7023 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
Martin v. Löwis3d325192011-11-04 18:23:06 +01007025 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007026
Victor Stinner3a50e702011-10-18 21:21:00 +02007027 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007028 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007029 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007030 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007031
Victor Stinner2fc507f2011-11-04 20:06:39 +01007032 substring = PyUnicode_Substring(unicode, offset, offset+len);
7033 if (substring == NULL)
7034 return -1;
7035 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7036 if (p == NULL) {
7037 Py_DECREF(substring);
7038 return -1;
7039 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007040
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007041 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007042 outsize = WideCharToMultiByte(code_page, flags,
7043 p, size,
7044 NULL, 0,
7045 NULL, pusedDefaultChar);
7046 if (outsize <= 0)
7047 goto error;
7048 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007049 if (pusedDefaultChar && *pusedDefaultChar) {
7050 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007051 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007052 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007056 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007057 if (*outbytes == NULL) {
7058 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007060 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007061 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062 }
7063 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007065 const Py_ssize_t n = PyBytes_Size(*outbytes);
7066 if (outsize > PY_SSIZE_T_MAX - n) {
7067 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007068 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007070 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007071 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7072 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007073 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007074 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007075 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007076 }
7077
7078 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007079 outsize = WideCharToMultiByte(code_page, flags,
7080 p, size,
7081 out, outsize,
7082 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007083 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007084 if (outsize <= 0)
7085 goto error;
7086 if (pusedDefaultChar && *pusedDefaultChar)
7087 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007089
Victor Stinner3a50e702011-10-18 21:21:00 +02007090error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007091 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007092 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7093 return -2;
7094 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007095 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007096}
7097
Victor Stinner3a50e702011-10-18 21:21:00 +02007098/*
7099 * Encode a Unicode string to a Windows code page into a byte string using a
7100 * error handler.
7101 *
7102 * Returns consumed characters if succeed, or raise a WindowsError and returns
7103 * -1 on other error.
7104 */
7105static int
7106encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007107 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007108 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007109{
Victor Stinner3a50e702011-10-18 21:21:00 +02007110 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007111 Py_ssize_t pos = unicode_offset;
7112 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 /* Ideally, we should get reason from FormatMessage. This is the Windows
7114 2000 English version of the message. */
7115 const char *reason = "invalid character";
7116 /* 4=maximum length of a UTF-8 sequence */
7117 char buffer[4];
7118 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7119 Py_ssize_t outsize;
7120 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 PyObject *errorHandler = NULL;
7122 PyObject *exc = NULL;
7123 PyObject *encoding_obj = NULL;
7124 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007125 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007126 PyObject *rep;
7127 int ret = -1;
7128
7129 assert(insize > 0);
7130
7131 encoding = code_page_name(code_page, &encoding_obj);
7132 if (encoding == NULL)
7133 return -1;
7134
7135 if (errors == NULL || strcmp(errors, "strict") == 0) {
7136 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7137 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007138 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 if (exc != NULL) {
7140 PyCodec_StrictErrors(exc);
7141 Py_DECREF(exc);
7142 }
7143 Py_XDECREF(encoding_obj);
7144 return -1;
7145 }
7146
7147 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7148 pusedDefaultChar = &usedDefaultChar;
7149 else
7150 pusedDefaultChar = NULL;
7151
7152 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7153 PyErr_NoMemory();
7154 goto error;
7155 }
7156 outsize = insize * Py_ARRAY_LENGTH(buffer);
7157
7158 if (*outbytes == NULL) {
7159 /* Create string object */
7160 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7161 if (*outbytes == NULL)
7162 goto error;
7163 out = PyBytes_AS_STRING(*outbytes);
7164 }
7165 else {
7166 /* Extend string object */
7167 Py_ssize_t n = PyBytes_Size(*outbytes);
7168 if (n > PY_SSIZE_T_MAX - outsize) {
7169 PyErr_NoMemory();
7170 goto error;
7171 }
7172 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7173 goto error;
7174 out = PyBytes_AS_STRING(*outbytes) + n;
7175 }
7176
7177 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007180 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7181 wchar_t chars[2];
7182 int charsize;
7183 if (ch < 0x10000) {
7184 chars[0] = (wchar_t)ch;
7185 charsize = 1;
7186 }
7187 else {
7188 ch -= 0x10000;
7189 chars[0] = 0xd800 + (ch >> 10);
7190 chars[1] = 0xdc00 + (ch & 0x3ff);
7191 charsize = 2;
7192 }
7193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007195 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007196 buffer, Py_ARRAY_LENGTH(buffer),
7197 NULL, pusedDefaultChar);
7198 if (outsize > 0) {
7199 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7200 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007201 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 memcpy(out, buffer, outsize);
7203 out += outsize;
7204 continue;
7205 }
7206 }
7207 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7208 PyErr_SetFromWindowsErr(0);
7209 goto error;
7210 }
7211
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 rep = unicode_encode_call_errorhandler(
7213 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007214 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007215 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 if (rep == NULL)
7217 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007218 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007219
7220 if (PyBytes_Check(rep)) {
7221 outsize = PyBytes_GET_SIZE(rep);
7222 if (outsize != 1) {
7223 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7224 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7225 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7226 Py_DECREF(rep);
7227 goto error;
7228 }
7229 out = PyBytes_AS_STRING(*outbytes) + offset;
7230 }
7231 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7232 out += outsize;
7233 }
7234 else {
7235 Py_ssize_t i;
7236 enum PyUnicode_Kind kind;
7237 void *data;
7238
Benjamin Petersonbac79492012-01-14 13:34:47 -05007239 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 Py_DECREF(rep);
7241 goto error;
7242 }
7243
7244 outsize = PyUnicode_GET_LENGTH(rep);
7245 if (outsize != 1) {
7246 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7247 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7248 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7249 Py_DECREF(rep);
7250 goto error;
7251 }
7252 out = PyBytes_AS_STRING(*outbytes) + offset;
7253 }
7254 kind = PyUnicode_KIND(rep);
7255 data = PyUnicode_DATA(rep);
7256 for (i=0; i < outsize; i++) {
7257 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7258 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007259 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007260 encoding, unicode,
7261 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007262 "unable to encode error handler result to ASCII");
7263 Py_DECREF(rep);
7264 goto error;
7265 }
7266 *out = (unsigned char)ch;
7267 out++;
7268 }
7269 }
7270 Py_DECREF(rep);
7271 }
7272 /* write a NUL byte */
7273 *out = 0;
7274 outsize = out - PyBytes_AS_STRING(*outbytes);
7275 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7276 if (_PyBytes_Resize(outbytes, outsize) < 0)
7277 goto error;
7278 ret = 0;
7279
7280error:
7281 Py_XDECREF(encoding_obj);
7282 Py_XDECREF(errorHandler);
7283 Py_XDECREF(exc);
7284 return ret;
7285}
7286
Victor Stinner3a50e702011-10-18 21:21:00 +02007287static PyObject *
7288encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007289 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 const char *errors)
7291{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007292 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007294 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007295 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007296
Benjamin Petersonbac79492012-01-14 13:34:47 -05007297 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007298 return NULL;
7299 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 if (code_page < 0) {
7302 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7303 return NULL;
7304 }
7305
Martin v. Löwis3d325192011-11-04 18:23:06 +01007306 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007307 return PyBytes_FromStringAndSize(NULL, 0);
7308
Victor Stinner7581cef2011-11-03 22:32:33 +01007309 offset = 0;
7310 do
7311 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007312#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007313 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007314 chunks. */
7315 if (len > INT_MAX/2) {
7316 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007317 done = 0;
7318 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007319 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007320#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007321 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007322 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007323 done = 1;
7324 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007325
Victor Stinner76a31a62011-11-04 00:05:13 +01007326 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007327 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007328 errors);
7329 if (ret == -2)
7330 ret = encode_code_page_errors(code_page, &outbytes,
7331 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007333 if (ret < 0) {
7334 Py_XDECREF(outbytes);
7335 return NULL;
7336 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner7581cef2011-11-03 22:32:33 +01007338 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007339 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 return outbytes;
7343}
7344
7345PyObject *
7346PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7347 Py_ssize_t size,
7348 const char *errors)
7349{
Victor Stinner7581cef2011-11-03 22:32:33 +01007350 PyObject *unicode, *res;
7351 unicode = PyUnicode_FromUnicode(p, size);
7352 if (unicode == NULL)
7353 return NULL;
7354 res = encode_code_page(CP_ACP, unicode, errors);
7355 Py_DECREF(unicode);
7356 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007357}
7358
7359PyObject *
7360PyUnicode_EncodeCodePage(int code_page,
7361 PyObject *unicode,
7362 const char *errors)
7363{
Victor Stinner7581cef2011-11-03 22:32:33 +01007364 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007365}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367PyObject *
7368PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007369{
7370 if (!PyUnicode_Check(unicode)) {
7371 PyErr_BadArgument();
7372 return NULL;
7373 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007374 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007375}
7376
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007377#undef NEED_RETRY
7378
Victor Stinner99b95382011-07-04 14:23:54 +02007379#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007380
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381/* --- Character Mapping Codec -------------------------------------------- */
7382
Alexander Belopolsky40018472011-02-26 01:02:56 +00007383PyObject *
7384PyUnicode_DecodeCharmap(const char *s,
7385 Py_ssize_t size,
7386 PyObject *mapping,
7387 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007390 Py_ssize_t startinpos;
7391 Py_ssize_t endinpos;
7392 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007393 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007394 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007395 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007396 PyObject *errorHandler = NULL;
7397 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007398
Guido van Rossumd57fd912000-03-10 22:53:23 +00007399 /* Default to Latin-1 */
7400 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007401 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007403 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007407 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007408 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007409 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007410 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007411 Py_ssize_t maplen;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007412 enum PyUnicode_Kind mapkind;
7413 void *mapdata;
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007414 Py_UCS4 x;
7415
Benjamin Petersonbac79492012-01-14 13:34:47 -05007416 if (PyUnicode_READY(mapping) == -1)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007417 return NULL;
7418
7419 maplen = PyUnicode_GET_LENGTH(mapping);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007420 mapdata = PyUnicode_DATA(mapping);
7421 mapkind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 while (s < e) {
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007423 unsigned char ch;
7424 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7425 enum PyUnicode_Kind outkind = PyUnicode_KIND(v);
7426 if (outkind == PyUnicode_1BYTE_KIND) {
7427 void *outdata = PyUnicode_DATA(v);
7428 Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(v);
7429 while (s < e) {
7430 unsigned char ch = *s;
7431 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7432 if (x > maxchar)
7433 goto Error;
7434 PyUnicode_WRITE(PyUnicode_1BYTE_KIND, outdata, outpos++, x);
7435 ++s;
7436 }
7437 break;
7438 }
7439 else if (outkind == PyUnicode_2BYTE_KIND) {
7440 void *outdata = PyUnicode_DATA(v);
7441 while (s < e) {
7442 unsigned char ch = *s;
7443 x = PyUnicode_READ(PyUnicode_2BYTE_KIND, mapdata, ch);
7444 if (x == 0xFFFE)
7445 goto Error;
7446 PyUnicode_WRITE(PyUnicode_2BYTE_KIND, outdata, outpos++, x);
7447 ++s;
7448 }
7449 break;
7450 }
7451 }
7452 ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007453
Benjamin Peterson29060642009-01-31 22:14:21 +00007454 if (ch < maplen)
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007455 x = PyUnicode_READ(mapkind, mapdata, ch);
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007456 else
7457 x = 0xfffe; /* invalid value */
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007458Error:
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007459 if (x == 0xfffe)
7460 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007461 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 startinpos = s-starts;
7463 endinpos = startinpos+1;
7464 if (unicode_decode_call_errorhandler(
7465 errors, &errorHandler,
7466 "charmap", "character maps to <undefined>",
7467 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007468 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007469 goto onError;
7470 }
7471 continue;
7472 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007473
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007474 if (unicode_putchar(&v, &outpos, x) < 0)
7475 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007476 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007477 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007478 }
7479 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 while (s < e) {
7481 unsigned char ch = *s;
7482 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007483
Benjamin Peterson29060642009-01-31 22:14:21 +00007484 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7485 w = PyLong_FromLong((long)ch);
7486 if (w == NULL)
7487 goto onError;
7488 x = PyObject_GetItem(mapping, w);
7489 Py_DECREF(w);
7490 if (x == NULL) {
7491 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7492 /* No mapping found means: mapping is undefined. */
7493 PyErr_Clear();
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007494 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007495 } else
7496 goto onError;
7497 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007498
Benjamin Peterson29060642009-01-31 22:14:21 +00007499 /* Apply mapping */
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007500 if (x == Py_None)
7501 goto Undefined;
Benjamin Peterson29060642009-01-31 22:14:21 +00007502 if (PyLong_Check(x)) {
7503 long value = PyLong_AS_LONG(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007504 if (value == 0xFFFE)
7505 goto Undefined;
Antoine Pitroua1f76552012-09-23 20:00:04 +02007506 if (value < 0 || value > MAX_UNICODE) {
7507 PyErr_Format(PyExc_TypeError,
7508 "character mapping must be in range(0x%lx)",
7509 (unsigned long)MAX_UNICODE + 1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 Py_DECREF(x);
7511 goto onError;
7512 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007513 if (unicode_putchar(&v, &outpos, value) < 0) {
7514 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007515 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007516 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007517 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007518 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007519 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007520
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007521 if (PyUnicode_READY(x) == -1) {
7522 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007523 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007524 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007525 targetsize = PyUnicode_GET_LENGTH(x);
7526
7527 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007528 /* 1-1 mapping */
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007529 Py_UCS4 value = PyUnicode_READ_CHAR(x, 0);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007530 if (value == 0xFFFE)
7531 goto Undefined;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007532 if (unicode_putchar(&v, &outpos, value) < 0) {
7533 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007534 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007535 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007537 else if (targetsize > 1) {
7538 /* 1-n mapping */
7539 if (targetsize > extrachars) {
7540 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007541 Py_ssize_t needed = (targetsize - extrachars) + \
7542 (targetsize << 2);
7543 extrachars += needed;
7544 /* XXX overflow detection missing */
Victor Stinner16e6a802011-12-12 13:24:15 +01007545 if (unicode_resize(&v,
7546 PyUnicode_GET_LENGTH(v) + needed) < 0)
7547 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007548 Py_DECREF(x);
7549 goto onError;
7550 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007551 }
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007552 if (unicode_widen(&v, outpos,
7553 PyUnicode_MAX_CHAR_VALUE(x)) < 0) {
7554 Py_DECREF(x);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007555 goto onError;
Serhiy Storchakaafb1cb52013-01-29 12:13:22 +02007556 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007557 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7558 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007559 extrachars -= targetsize;
7560 }
7561 /* 1-0 mapping: skip the character */
7562 }
7563 else {
7564 /* wrong return value */
7565 PyErr_SetString(PyExc_TypeError,
7566 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007567 Py_DECREF(x);
7568 goto onError;
7569 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007570 Py_DECREF(x);
7571 ++s;
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007572 continue;
7573Undefined:
7574 /* undefined mapping */
7575 Py_XDECREF(x);
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007576 startinpos = s-starts;
7577 endinpos = startinpos+1;
7578 if (unicode_decode_call_errorhandler(
7579 errors, &errorHandler,
7580 "charmap", "character maps to <undefined>",
7581 &starts, &e, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka45d16d92013-01-15 15:01:20 +02007582 &v, &outpos)) {
Serhiy Storchaka4fb8cae2013-01-15 14:43:21 +02007583 goto onError;
7584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007586 }
Victor Stinner16e6a802011-12-12 13:24:15 +01007587 if (unicode_resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007588 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589 Py_XDECREF(errorHandler);
7590 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007591 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007592
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007594 Py_XDECREF(errorHandler);
7595 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007596 Py_XDECREF(v);
7597 return NULL;
7598}
7599
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007600/* Charmap encoding: the lookup table */
7601
Alexander Belopolsky40018472011-02-26 01:02:56 +00007602struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 PyObject_HEAD
7604 unsigned char level1[32];
7605 int count2, count3;
7606 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007607};
7608
7609static PyObject*
7610encoding_map_size(PyObject *obj, PyObject* args)
7611{
7612 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007613 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007615}
7616
7617static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007618 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 PyDoc_STR("Return the size (in bytes) of this object") },
7620 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007621};
7622
7623static void
7624encoding_map_dealloc(PyObject* o)
7625{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007626 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627}
7628
7629static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007630 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 "EncodingMap", /*tp_name*/
7632 sizeof(struct encoding_map), /*tp_basicsize*/
7633 0, /*tp_itemsize*/
7634 /* methods */
7635 encoding_map_dealloc, /*tp_dealloc*/
7636 0, /*tp_print*/
7637 0, /*tp_getattr*/
7638 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007639 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007640 0, /*tp_repr*/
7641 0, /*tp_as_number*/
7642 0, /*tp_as_sequence*/
7643 0, /*tp_as_mapping*/
7644 0, /*tp_hash*/
7645 0, /*tp_call*/
7646 0, /*tp_str*/
7647 0, /*tp_getattro*/
7648 0, /*tp_setattro*/
7649 0, /*tp_as_buffer*/
7650 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7651 0, /*tp_doc*/
7652 0, /*tp_traverse*/
7653 0, /*tp_clear*/
7654 0, /*tp_richcompare*/
7655 0, /*tp_weaklistoffset*/
7656 0, /*tp_iter*/
7657 0, /*tp_iternext*/
7658 encoding_map_methods, /*tp_methods*/
7659 0, /*tp_members*/
7660 0, /*tp_getset*/
7661 0, /*tp_base*/
7662 0, /*tp_dict*/
7663 0, /*tp_descr_get*/
7664 0, /*tp_descr_set*/
7665 0, /*tp_dictoffset*/
7666 0, /*tp_init*/
7667 0, /*tp_alloc*/
7668 0, /*tp_new*/
7669 0, /*tp_free*/
7670 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007671};
7672
7673PyObject*
7674PyUnicode_BuildEncodingMap(PyObject* string)
7675{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007676 PyObject *result;
7677 struct encoding_map *mresult;
7678 int i;
7679 int need_dict = 0;
7680 unsigned char level1[32];
7681 unsigned char level2[512];
7682 unsigned char *mlevel1, *mlevel2, *mlevel3;
7683 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007684 int kind;
7685 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007686 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007687 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007688
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007689 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690 PyErr_BadArgument();
7691 return NULL;
7692 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007693 kind = PyUnicode_KIND(string);
7694 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007695 length = PyUnicode_GET_LENGTH(string);
7696 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007697 memset(level1, 0xFF, sizeof level1);
7698 memset(level2, 0xFF, sizeof level2);
7699
7700 /* If there isn't a one-to-one mapping of NULL to \0,
7701 or if there are non-BMP characters, we need to use
7702 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007703 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007704 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007705 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007706 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007707 ch = PyUnicode_READ(kind, data, i);
7708 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007709 need_dict = 1;
7710 break;
7711 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007712 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 /* unmapped character */
7714 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007715 l1 = ch >> 11;
7716 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 if (level1[l1] == 0xFF)
7718 level1[l1] = count2++;
7719 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007720 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721 }
7722
7723 if (count2 >= 0xFF || count3 >= 0xFF)
7724 need_dict = 1;
7725
7726 if (need_dict) {
7727 PyObject *result = PyDict_New();
7728 PyObject *key, *value;
7729 if (!result)
7730 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007731 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007732 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007733 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007734 if (!key || !value)
7735 goto failed1;
7736 if (PyDict_SetItem(result, key, value) == -1)
7737 goto failed1;
7738 Py_DECREF(key);
7739 Py_DECREF(value);
7740 }
7741 return result;
7742 failed1:
7743 Py_XDECREF(key);
7744 Py_XDECREF(value);
7745 Py_DECREF(result);
7746 return NULL;
7747 }
7748
7749 /* Create a three-level trie */
7750 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7751 16*count2 + 128*count3 - 1);
7752 if (!result)
7753 return PyErr_NoMemory();
7754 PyObject_Init(result, &EncodingMapType);
7755 mresult = (struct encoding_map*)result;
7756 mresult->count2 = count2;
7757 mresult->count3 = count3;
7758 mlevel1 = mresult->level1;
7759 mlevel2 = mresult->level23;
7760 mlevel3 = mresult->level23 + 16*count2;
7761 memcpy(mlevel1, level1, 32);
7762 memset(mlevel2, 0xFF, 16*count2);
7763 memset(mlevel3, 0, 128*count3);
7764 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007765 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007766 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007767 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7768 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007769 /* unmapped character */
7770 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007771 o1 = ch>>11;
7772 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007773 i2 = 16*mlevel1[o1] + o2;
7774 if (mlevel2[i2] == 0xFF)
7775 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007776 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007777 i3 = 128*mlevel2[i2] + o3;
7778 mlevel3[i3] = i;
7779 }
7780 return result;
7781}
7782
7783static int
Victor Stinner22168992011-11-20 17:09:18 +01007784encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007785{
7786 struct encoding_map *map = (struct encoding_map*)mapping;
7787 int l1 = c>>11;
7788 int l2 = (c>>7) & 0xF;
7789 int l3 = c & 0x7F;
7790 int i;
7791
Victor Stinner22168992011-11-20 17:09:18 +01007792 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007793 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007794 if (c == 0)
7795 return 0;
7796 /* level 1*/
7797 i = map->level1[l1];
7798 if (i == 0xFF) {
7799 return -1;
7800 }
7801 /* level 2*/
7802 i = map->level23[16*i+l2];
7803 if (i == 0xFF) {
7804 return -1;
7805 }
7806 /* level 3 */
7807 i = map->level23[16*map->count2 + 128*i + l3];
7808 if (i == 0) {
7809 return -1;
7810 }
7811 return i;
7812}
7813
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814/* Lookup the character ch in the mapping. If the character
7815 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007816 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007817static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007818charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007819{
Christian Heimes217cfd12007-12-02 14:31:20 +00007820 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007821 PyObject *x;
7822
7823 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007825 x = PyObject_GetItem(mapping, w);
7826 Py_DECREF(w);
7827 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007828 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7829 /* No mapping found means: mapping is undefined. */
7830 PyErr_Clear();
7831 x = Py_None;
7832 Py_INCREF(x);
7833 return x;
7834 } else
7835 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007837 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007838 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007839 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 long value = PyLong_AS_LONG(x);
7841 if (value < 0 || value > 255) {
7842 PyErr_SetString(PyExc_TypeError,
7843 "character mapping must be in range(256)");
7844 Py_DECREF(x);
7845 return NULL;
7846 }
7847 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007849 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007851 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 /* wrong return value */
7853 PyErr_Format(PyExc_TypeError,
7854 "character mapping must return integer, bytes or None, not %.400s",
7855 x->ob_type->tp_name);
7856 Py_DECREF(x);
7857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007858 }
7859}
7860
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007861static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007862charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007864 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7865 /* exponentially overallocate to minimize reallocations */
7866 if (requiredsize < 2*outsize)
7867 requiredsize = 2*outsize;
7868 if (_PyBytes_Resize(outobj, requiredsize))
7869 return -1;
7870 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871}
7872
/* Outcome of trying to encode one character through a character map. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007876/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007877 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007878 space is available. Return a new reference to the object that
7879 was put in the output buffer, or Py_None, if the mapping was undefined
7880 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007881 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007882static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007883charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007886 PyObject *rep;
7887 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007888 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889
Christian Heimes90aa7642007-12-19 02:45:37 +00007890 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007891 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (res == -1)
7894 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007895 if (outsize<requiredsize)
7896 if (charmapencode_resize(outobj, outpos, requiredsize))
7897 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007898 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007899 outstart[(*outpos)++] = (char)res;
7900 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901 }
7902
7903 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007904 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007907 Py_DECREF(rep);
7908 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007910 if (PyLong_Check(rep)) {
7911 Py_ssize_t requiredsize = *outpos+1;
7912 if (outsize<requiredsize)
7913 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7914 Py_DECREF(rep);
7915 return enc_EXCEPTION;
7916 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007917 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007918 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007919 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007920 else {
7921 const char *repchars = PyBytes_AS_STRING(rep);
7922 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7923 Py_ssize_t requiredsize = *outpos+repsize;
7924 if (outsize<requiredsize)
7925 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7926 Py_DECREF(rep);
7927 return enc_EXCEPTION;
7928 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007929 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 memcpy(outstart + *outpos, repchars, repsize);
7931 *outpos += repsize;
7932 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007933 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 Py_DECREF(rep);
7935 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007936}
7937
7938/* handle an error in PyUnicode_EncodeCharmap
7939 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007940static int
7941charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007942 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007944 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007945 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946{
7947 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007948 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007949 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01007950 enum PyUnicode_Kind kind;
7951 void *data;
7952 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007953 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007954 Py_ssize_t collstartpos = *inpos;
7955 Py_ssize_t collendpos = *inpos+1;
7956 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007957 char *encoding = "charmap";
7958 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007959 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007960 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05007961 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962
Benjamin Petersonbac79492012-01-14 13:34:47 -05007963 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007964 return -1;
7965 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007966 /* find all unencodable characters */
7967 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007968 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007969 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007970 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05007971 val = encoding_map_lookup(ch, mapping);
7972 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 break;
7974 ++collendpos;
7975 continue;
7976 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007977
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007978 ch = PyUnicode_READ_CHAR(unicode, collendpos);
7979 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 if (rep==NULL)
7981 return -1;
7982 else if (rep!=Py_None) {
7983 Py_DECREF(rep);
7984 break;
7985 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007986 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007988 }
7989 /* cache callback name lookup
7990 * (if not done yet, i.e. it's the first error) */
7991 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007992 if ((errors==NULL) || (!strcmp(errors, "strict")))
7993 *known_errorHandler = 1;
7994 else if (!strcmp(errors, "replace"))
7995 *known_errorHandler = 2;
7996 else if (!strcmp(errors, "ignore"))
7997 *known_errorHandler = 3;
7998 else if (!strcmp(errors, "xmlcharrefreplace"))
7999 *known_errorHandler = 4;
8000 else
8001 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008002 }
8003 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008004 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008005 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008006 return -1;
8007 case 2: /* replace */
8008 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008009 x = charmapencode_output('?', mapping, res, respos);
8010 if (x==enc_EXCEPTION) {
8011 return -1;
8012 }
8013 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008014 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return -1;
8016 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008017 }
8018 /* fall through */
8019 case 3: /* ignore */
8020 *inpos = collendpos;
8021 break;
8022 case 4: /* xmlcharrefreplace */
8023 /* generate replacement (temporarily (mis)uses p) */
8024 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 char buffer[2+29+1+1];
8026 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008027 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 for (cp = buffer; *cp; ++cp) {
8029 x = charmapencode_output(*cp, mapping, res, respos);
8030 if (x==enc_EXCEPTION)
8031 return -1;
8032 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008033 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 return -1;
8035 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008036 }
8037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008038 *inpos = collendpos;
8039 break;
8040 default:
8041 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008042 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008044 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008046 if (PyBytes_Check(repunicode)) {
8047 /* Directly copy bytes result to output. */
8048 Py_ssize_t outsize = PyBytes_Size(*res);
8049 Py_ssize_t requiredsize;
8050 repsize = PyBytes_Size(repunicode);
8051 requiredsize = *respos + repsize;
8052 if (requiredsize > outsize)
8053 /* Make room for all additional bytes. */
8054 if (charmapencode_resize(res, respos, requiredsize)) {
8055 Py_DECREF(repunicode);
8056 return -1;
8057 }
8058 memcpy(PyBytes_AsString(*res) + *respos,
8059 PyBytes_AsString(repunicode), repsize);
8060 *respos += repsize;
8061 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008062 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008063 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008064 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008065 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008066 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008067 Py_DECREF(repunicode);
8068 return -1;
8069 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008070 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008071 data = PyUnicode_DATA(repunicode);
8072 kind = PyUnicode_KIND(repunicode);
8073 for (index = 0; index < repsize; index++) {
8074 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8075 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008076 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008077 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 return -1;
8079 }
8080 else if (x==enc_FAILED) {
8081 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008082 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 return -1;
8084 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008085 }
8086 *inpos = newpos;
8087 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 }
8089 return 0;
8090}
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093_PyUnicode_EncodeCharmap(PyObject *unicode,
8094 PyObject *mapping,
8095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008096{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 /* output object */
8098 PyObject *res = NULL;
8099 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008100 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008102 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008103 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 PyObject *errorHandler = NULL;
8105 PyObject *exc = NULL;
8106 /* the following variable is used for caching string comparisons
8107 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8108 * 3=ignore, 4=xmlcharrefreplace */
8109 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008112 return NULL;
8113 size = PyUnicode_GET_LENGTH(unicode);
8114
Guido van Rossumd57fd912000-03-10 22:53:23 +00008115 /* Default to Latin-1 */
8116 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008118
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 /* allocate enough for a simple encoding without
8120 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008121 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 if (res == NULL)
8123 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008124 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008126
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008128 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008130 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 if (x==enc_EXCEPTION) /* error */
8132 goto onError;
8133 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008134 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 &exc,
8136 &known_errorHandler, &errorHandler, errors,
8137 &res, &respos)) {
8138 goto onError;
8139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008140 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 else
8142 /* done with this character => adjust input position */
8143 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008147 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008148 if (_PyBytes_Resize(&res, respos) < 0)
8149 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008150
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151 Py_XDECREF(exc);
8152 Py_XDECREF(errorHandler);
8153 return res;
8154
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008156 Py_XDECREF(res);
8157 Py_XDECREF(exc);
8158 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 return NULL;
8160}
8161
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008162/* Deprecated */
8163PyObject *
8164PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8165 Py_ssize_t size,
8166 PyObject *mapping,
8167 const char *errors)
8168{
8169 PyObject *result;
8170 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8171 if (unicode == NULL)
8172 return NULL;
8173 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8174 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008175 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008176}
8177
Alexander Belopolsky40018472011-02-26 01:02:56 +00008178PyObject *
8179PyUnicode_AsCharmapString(PyObject *unicode,
8180 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181{
8182 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 PyErr_BadArgument();
8184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008185 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008186 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008187}
8188
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008189/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008190static void
8191make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008193 Py_ssize_t startpos, Py_ssize_t endpos,
8194 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008196 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 *exceptionObject = _PyUnicodeTranslateError_Create(
8198 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008199 }
8200 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8202 goto onError;
8203 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8204 goto onError;
8205 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8206 goto onError;
8207 return;
8208 onError:
8209 Py_DECREF(*exceptionObject);
8210 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008211 }
8212}
8213
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008214/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008215static void
8216raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008217 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008218 Py_ssize_t startpos, Py_ssize_t endpos,
8219 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220{
8221 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008222 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008223 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225}
8226
8227/* error handling callback helper:
8228 build arguments, call the callback and check the arguments,
8229 put the result into newpos and return the replacement string, which
8230 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008231static PyObject *
8232unicode_translate_call_errorhandler(const char *errors,
8233 PyObject **errorHandler,
8234 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008235 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008236 Py_ssize_t startpos, Py_ssize_t endpos,
8237 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008239 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008240
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008241 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008242 PyObject *restuple;
8243 PyObject *resunicode;
8244
8245 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008246 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008247 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008249 }
8250
8251 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008253 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008255
8256 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008258 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008261 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 Py_DECREF(restuple);
8263 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 &resunicode, &i_newpos)) {
8267 Py_DECREF(restuple);
8268 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008270 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008272 else
8273 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8276 Py_DECREF(restuple);
8277 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008278 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 Py_INCREF(resunicode);
8280 Py_DECREF(restuple);
8281 return resunicode;
8282}
8283
8284/* Lookup the character ch in the mapping and put the result in result,
8285 which must be decrefed by the caller.
8286 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008287static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008288charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008289{
Christian Heimes217cfd12007-12-02 14:31:20 +00008290 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291 PyObject *x;
8292
8293 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008294 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295 x = PyObject_GetItem(mapping, w);
8296 Py_DECREF(w);
8297 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008298 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8299 /* No mapping found means: use 1:1 mapping. */
8300 PyErr_Clear();
8301 *result = NULL;
8302 return 0;
8303 } else
8304 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 }
8306 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 *result = x;
8308 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 long value = PyLong_AS_LONG(x);
8312 long max = PyUnicode_GetMax();
8313 if (value < 0 || value > max) {
8314 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008315 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 Py_DECREF(x);
8317 return -1;
8318 }
8319 *result = x;
8320 return 0;
8321 }
8322 else if (PyUnicode_Check(x)) {
8323 *result = x;
8324 return 0;
8325 }
8326 else {
8327 /* wrong return value */
8328 PyErr_SetString(PyExc_TypeError,
8329 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008330 Py_DECREF(x);
8331 return -1;
8332 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333}
8334/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 if not reallocate and adjust various state variables.
8336 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008337static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008339 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008341 Py_ssize_t oldsize = *psize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008342 Py_UCS4 *new_outobj;
Walter Dörwald4894c302003-10-24 14:25:28 +00008343 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008344 /* exponentially overallocate to minimize reallocations */
8345 if (requiredsize < 2 * oldsize)
8346 requiredsize = 2 * oldsize;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008347 new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8348 if (new_outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 return -1;
Kristjan Valur Jonsson85634d72012-05-31 09:37:31 +00008350 *outobj = new_outobj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008352 }
8353 return 0;
8354}
8355/* lookup the character, put the result in the output string and adjust
8356 various state variables. Return a new reference to the object that
8357 was put in the output buffer in *result, or Py_None, if the mapping was
8358 undefined (in which case no character was written).
8359 The called must decref result.
8360 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8363 PyObject *mapping, Py_UCS4 **output,
8364 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008365 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008367 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8368 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373 }
8374 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008376 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 }
8380 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 Py_ssize_t repsize;
8382 if (PyUnicode_READY(*res) == -1)
8383 return -1;
8384 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 if (repsize==1) {
8386 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 }
8389 else if (repsize!=0) {
8390 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 Py_ssize_t requiredsize = *opos +
8392 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 Py_ssize_t i;
8395 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 for(i = 0; i < repsize; i++)
8398 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400 }
8401 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 return 0;
8404}
8405
Alexander Belopolsky40018472011-02-26 01:02:56 +00008406PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008407_PyUnicode_TranslateCharmap(PyObject *input,
8408 PyObject *mapping,
8409 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411 /* input object */
8412 char *idata;
8413 Py_ssize_t size, i;
8414 int kind;
8415 /* output buffer */
8416 Py_UCS4 *output = NULL;
8417 Py_ssize_t osize;
8418 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008420 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 char *reason = "character maps to <undefined>";
8422 PyObject *errorHandler = NULL;
8423 PyObject *exc = NULL;
8424 /* the following variable is used for caching string comparisons
8425 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8426 * 3=ignore, 4=xmlcharrefreplace */
8427 int known_errorHandler = -1;
8428
Guido van Rossumd57fd912000-03-10 22:53:23 +00008429 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 PyErr_BadArgument();
8431 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008432 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008434 if (PyUnicode_READY(input) == -1)
8435 return NULL;
8436 idata = (char*)PyUnicode_DATA(input);
8437 kind = PyUnicode_KIND(input);
8438 size = PyUnicode_GET_LENGTH(input);
8439 i = 0;
8440
8441 if (size == 0) {
8442 Py_INCREF(input);
8443 return input;
8444 }
8445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008446 /* allocate enough for a simple 1:1 translation without
8447 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008448 osize = size;
8449 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8450 opos = 0;
8451 if (output == NULL) {
8452 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008456 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 /* try to encode it */
8458 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008459 if (charmaptranslate_output(input, i, mapping,
8460 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 Py_XDECREF(x);
8462 goto onError;
8463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008464 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008466 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 else { /* untranslatable character */
8468 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8469 Py_ssize_t repsize;
8470 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008471 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 Py_ssize_t collstart = i;
8474 Py_ssize_t collend = i+1;
8475 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008476
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008478 while (collend < size) {
8479 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 goto onError;
8481 Py_XDECREF(x);
8482 if (x!=Py_None)
8483 break;
8484 ++collend;
8485 }
8486 /* cache callback name lookup
8487 * (if not done yet, i.e. it's the first error) */
8488 if (known_errorHandler==-1) {
8489 if ((errors==NULL) || (!strcmp(errors, "strict")))
8490 known_errorHandler = 1;
8491 else if (!strcmp(errors, "replace"))
8492 known_errorHandler = 2;
8493 else if (!strcmp(errors, "ignore"))
8494 known_errorHandler = 3;
8495 else if (!strcmp(errors, "xmlcharrefreplace"))
8496 known_errorHandler = 4;
8497 else
8498 known_errorHandler = 0;
8499 }
8500 switch (known_errorHandler) {
8501 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008502 raise_translate_exception(&exc, input, collstart,
8503 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 case 2: /* replace */
8506 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 for (coll = collstart; coll<collend; coll++)
8508 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 /* fall through */
8510 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 break;
8513 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008514 /* generate replacement (temporarily (mis)uses i) */
8515 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 char buffer[2+29+1+1];
8517 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8519 if (charmaptranslate_makespace(&output, &osize,
8520 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 goto onError;
8522 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008523 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008526 break;
8527 default:
8528 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 reason, input, &exc,
8530 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008531 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 goto onError;
Benjamin Peterson9ca3ffa2012-01-01 16:04:29 -06008533 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008534 Py_DECREF(repunicode);
8535 goto onError;
8536 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008537 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 repsize = PyUnicode_GET_LENGTH(repunicode);
8539 if (charmaptranslate_makespace(&output, &osize,
8540 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 Py_DECREF(repunicode);
8542 goto onError;
8543 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544 for (uni2 = 0; repsize-->0; ++uni2)
8545 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8546 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008548 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008549 }
8550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8552 if (!res)
8553 goto onError;
8554 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 Py_XDECREF(exc);
8556 Py_XDECREF(errorHandler);
8557 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008558
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 Py_XDECREF(exc);
8562 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008563 return NULL;
8564}
8565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566/* Deprecated. Use PyUnicode_Translate instead. */
8567PyObject *
8568PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8569 Py_ssize_t size,
8570 PyObject *mapping,
8571 const char *errors)
8572{
Christian Heimes5f520f42012-09-11 14:03:25 +02008573 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008574 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8575 if (!unicode)
8576 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008577 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8578 Py_DECREF(unicode);
8579 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582PyObject *
8583PyUnicode_Translate(PyObject *str,
8584 PyObject *mapping,
8585 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008586{
8587 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008588
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 str = PyUnicode_FromObject(str);
8590 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008591 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008593 Py_DECREF(str);
8594 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595}
Tim Petersced69f82003-09-16 20:30:58 +00008596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008597static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008598fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599{
8600 /* No need to call PyUnicode_READY(self) because this function is only
8601 called as a callback from fixup() which does it already. */
8602 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8603 const int kind = PyUnicode_KIND(self);
8604 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008605 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008606 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 Py_ssize_t i;
8608
8609 for (i = 0; i < len; ++i) {
8610 ch = PyUnicode_READ(kind, data, i);
8611 fixed = 0;
8612 if (ch > 127) {
8613 if (Py_UNICODE_ISSPACE(ch))
8614 fixed = ' ';
8615 else {
8616 const int decimal = Py_UNICODE_TODECIMAL(ch);
8617 if (decimal >= 0)
8618 fixed = '0' + decimal;
8619 }
8620 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008621 modified = 1;
Victor Stinnere6abb482012-05-02 01:15:40 +02008622 maxchar = MAX_MAXCHAR(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008623 PyUnicode_WRITE(kind, data, i, fixed);
8624 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008625 else
8626 maxchar = MAX_MAXCHAR(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 }
8629
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008630 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008631}
8632
8633PyObject *
8634_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8635{
8636 if (!PyUnicode_Check(unicode)) {
8637 PyErr_BadInternalCall();
8638 return NULL;
8639 }
8640 if (PyUnicode_READY(unicode) == -1)
8641 return NULL;
8642 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8643 /* If the string is already ASCII, just return the same string */
8644 Py_INCREF(unicode);
8645 return unicode;
8646 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008647 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648}
8649
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008650PyObject *
8651PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8652 Py_ssize_t length)
8653{
Victor Stinnerf0124502011-11-21 23:12:56 +01008654 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008655 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008656 Py_UCS4 maxchar;
8657 enum PyUnicode_Kind kind;
8658 void *data;
8659
Victor Stinner99d7ad02012-02-22 13:37:39 +01008660 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008661 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008662 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008663 if (ch > 127) {
8664 int decimal = Py_UNICODE_TODECIMAL(ch);
8665 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008666 ch = '0' + decimal;
Victor Stinnere6abb482012-05-02 01:15:40 +02008667 maxchar = MAX_MAXCHAR(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008668 }
8669 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008670
8671 /* Copy to a new string */
8672 decimal = PyUnicode_New(length, maxchar);
8673 if (decimal == NULL)
8674 return decimal;
8675 kind = PyUnicode_KIND(decimal);
8676 data = PyUnicode_DATA(decimal);
8677 /* Iterate over code points */
8678 for (i = 0; i < length; i++) {
8679 Py_UNICODE ch = s[i];
8680 if (ch > 127) {
8681 int decimal = Py_UNICODE_TODECIMAL(ch);
8682 if (decimal >= 0)
8683 ch = '0' + decimal;
8684 }
8685 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008687 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008688}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008689/* --- Decimal Encoder ---------------------------------------------------- */
8690
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691int
8692PyUnicode_EncodeDecimal(Py_UNICODE *s,
8693 Py_ssize_t length,
8694 char *output,
8695 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008696{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008697 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008698 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008699 enum PyUnicode_Kind kind;
8700 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008701
8702 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 PyErr_BadArgument();
8704 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008705 }
8706
Victor Stinner42bf7752011-11-21 22:52:58 +01008707 unicode = PyUnicode_FromUnicode(s, length);
8708 if (unicode == NULL)
8709 return -1;
8710
Benjamin Petersonbac79492012-01-14 13:34:47 -05008711 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008712 Py_DECREF(unicode);
8713 return -1;
8714 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008715 kind = PyUnicode_KIND(unicode);
8716 data = PyUnicode_DATA(unicode);
8717
Victor Stinnerb84d7232011-11-22 01:50:07 +01008718 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008719 PyObject *exc;
8720 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008722 Py_ssize_t startpos;
8723
8724 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008725
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008727 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008728 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 decimal = Py_UNICODE_TODECIMAL(ch);
8732 if (decimal >= 0) {
8733 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008734 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008735 continue;
8736 }
8737 if (0 < ch && ch < 256) {
8738 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008739 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 continue;
8741 }
Victor Stinner6345be92011-11-25 20:09:01 +01008742
Victor Stinner42bf7752011-11-21 22:52:58 +01008743 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008744 exc = NULL;
8745 raise_encode_exception(&exc, "decimal", unicode,
8746 startpos, startpos+1,
8747 "invalid decimal Unicode string");
8748 Py_XDECREF(exc);
8749 Py_DECREF(unicode);
8750 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008751 }
8752 /* 0-terminate the output string */
8753 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008754 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008755 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008756}
8757
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758/* --- Helpers ------------------------------------------------------------ */
8759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008760static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008761any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 Py_ssize_t start,
8763 Py_ssize_t end)
8764{
8765 int kind1, kind2, kind;
8766 void *buf1, *buf2;
8767 Py_ssize_t len1, len2, result;
8768
8769 kind1 = PyUnicode_KIND(s1);
8770 kind2 = PyUnicode_KIND(s2);
8771 kind = kind1 > kind2 ? kind1 : kind2;
8772 buf1 = PyUnicode_DATA(s1);
8773 buf2 = PyUnicode_DATA(s2);
8774 if (kind1 != kind)
8775 buf1 = _PyUnicode_AsKind(s1, kind);
8776 if (!buf1)
8777 return -2;
8778 if (kind2 != kind)
8779 buf2 = _PyUnicode_AsKind(s2, kind);
8780 if (!buf2) {
8781 if (kind1 != kind) PyMem_Free(buf1);
8782 return -2;
8783 }
8784 len1 = PyUnicode_GET_LENGTH(s1);
8785 len2 = PyUnicode_GET_LENGTH(s2);
8786
Victor Stinner794d5672011-10-10 03:21:36 +02008787 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008788 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008789 case PyUnicode_1BYTE_KIND:
8790 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8791 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8792 else
8793 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8794 break;
8795 case PyUnicode_2BYTE_KIND:
8796 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8797 break;
8798 case PyUnicode_4BYTE_KIND:
8799 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8800 break;
8801 default:
8802 assert(0); result = -2;
8803 }
8804 }
8805 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06008806 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02008807 case PyUnicode_1BYTE_KIND:
8808 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8809 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8810 else
8811 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8812 break;
8813 case PyUnicode_2BYTE_KIND:
8814 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8815 break;
8816 case PyUnicode_4BYTE_KIND:
8817 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8818 break;
8819 default:
8820 assert(0); result = -2;
8821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008822 }
8823
8824 if (kind1 != kind)
8825 PyMem_Free(buf1);
8826 if (kind2 != kind)
8827 PyMem_Free(buf2);
8828
8829 return result;
8830}
8831
8832Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01008833_PyUnicode_InsertThousandsGrouping(
8834 PyObject *unicode, Py_ssize_t index,
8835 Py_ssize_t n_buffer,
8836 void *digits, Py_ssize_t n_digits,
8837 Py_ssize_t min_width,
8838 const char *grouping, PyObject *thousands_sep,
8839 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840{
Victor Stinner41a863c2012-02-24 00:37:51 +01008841 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008842 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01008843 Py_ssize_t thousands_sep_len;
8844 Py_ssize_t len;
8845
8846 if (unicode != NULL) {
8847 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008848 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01008849 }
8850 else {
8851 kind = PyUnicode_1BYTE_KIND;
8852 data = NULL;
8853 }
8854 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8855 thousands_sep_data = PyUnicode_DATA(thousands_sep);
8856 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8857 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01008858 if (thousands_sep_kind < kind) {
8859 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8860 if (!thousands_sep_data)
8861 return -1;
8862 }
8863 else {
8864 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8865 if (!data)
8866 return -1;
8867 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008868 }
8869
Benjamin Petersonead6b532011-12-20 17:23:42 -06008870 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008871 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008872 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01008873 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008874 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008875 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008876 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02008877 else
Victor Stinner41a863c2012-02-24 00:37:51 +01008878 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008879 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008880 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008881 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008882 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008883 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008884 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008885 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008886 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008887 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008888 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008889 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01008890 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008891 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01008892 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01008893 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01008894 break;
8895 default:
8896 assert(0);
8897 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008898 }
Victor Stinner90f50d42012-02-24 01:44:47 +01008899 if (unicode != NULL && thousands_sep_kind != kind) {
8900 if (thousands_sep_kind < kind)
8901 PyMem_Free(thousands_sep_data);
8902 else
8903 PyMem_Free(data);
8904 }
Victor Stinner41a863c2012-02-24 00:37:51 +01008905 if (unicode == NULL) {
8906 *maxchar = 127;
8907 if (len != n_digits) {
Victor Stinnere6abb482012-05-02 01:15:40 +02008908 *maxchar = MAX_MAXCHAR(*maxchar,
8909 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01008910 }
8911 }
8912 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008913}
8914
8915
/* Helper macro to fix up start/end slice values against a length `len`.

   `end` is capped at `len`; a negative `start` or `end` is interpreted
   relative to `len` and clamped at 0 if still negative.  `start` and `end`
   must be modifiable lvalues and are updated in place; every argument is
   evaluated more than once, so pass expressions without side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to
   exactly one statement and can be used safely as the body of an unbraced
   if/else.  Invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008930
Alexander Belopolsky40018472011-02-26 01:02:56 +00008931Py_ssize_t
8932PyUnicode_Count(PyObject *str,
8933 PyObject *substr,
8934 Py_ssize_t start,
8935 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008937 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008938 PyObject* str_obj;
8939 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008940 int kind1, kind2, kind;
8941 void *buf1 = NULL, *buf2 = NULL;
8942 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008943
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008944 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008945 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02008947 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06008948 if (!sub_obj) {
8949 Py_DECREF(str_obj);
8950 return -1;
8951 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06008952 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06008953 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 Py_DECREF(str_obj);
8955 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956 }
Tim Petersced69f82003-09-16 20:30:58 +00008957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 kind1 = PyUnicode_KIND(str_obj);
8959 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008960 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008963 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02008964 if (kind2 > kind) {
8965 Py_DECREF(sub_obj);
8966 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02008967 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02008968 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01008969 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05008970 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 if (!buf2)
8972 goto onError;
8973 len1 = PyUnicode_GET_LENGTH(str_obj);
8974 len2 = PyUnicode_GET_LENGTH(sub_obj);
8975
8976 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06008977 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008979 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8980 result = asciilib_count(
8981 ((Py_UCS1*)buf1) + start, end - start,
8982 buf2, len2, PY_SSIZE_T_MAX
8983 );
8984 else
8985 result = ucs1lib_count(
8986 ((Py_UCS1*)buf1) + start, end - start,
8987 buf2, len2, PY_SSIZE_T_MAX
8988 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 break;
8990 case PyUnicode_2BYTE_KIND:
8991 result = ucs2lib_count(
8992 ((Py_UCS2*)buf1) + start, end - start,
8993 buf2, len2, PY_SSIZE_T_MAX
8994 );
8995 break;
8996 case PyUnicode_4BYTE_KIND:
8997 result = ucs4lib_count(
8998 ((Py_UCS4*)buf1) + start, end - start,
8999 buf2, len2, PY_SSIZE_T_MAX
9000 );
9001 break;
9002 default:
9003 assert(0); result = 0;
9004 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009005
9006 Py_DECREF(sub_obj);
9007 Py_DECREF(str_obj);
9008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (kind2 != kind)
9010 PyMem_Free(buf2);
9011
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 onError:
9014 Py_DECREF(sub_obj);
9015 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 if (kind2 != kind && buf2)
9017 PyMem_Free(buf2);
9018 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019}
9020
Alexander Belopolsky40018472011-02-26 01:02:56 +00009021Py_ssize_t
9022PyUnicode_Find(PyObject *str,
9023 PyObject *sub,
9024 Py_ssize_t start,
9025 Py_ssize_t end,
9026 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009028 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009029
Guido van Rossumd57fd912000-03-10 22:53:23 +00009030 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009031 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009032 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009033 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009034 if (!sub) {
9035 Py_DECREF(str);
9036 return -2;
9037 }
9038 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9039 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 Py_DECREF(str);
9041 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Tim Petersced69f82003-09-16 20:30:58 +00009043
Victor Stinner794d5672011-10-10 03:21:36 +02009044 result = any_find_slice(direction,
9045 str, sub, start, end
9046 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009047
Guido van Rossumd57fd912000-03-10 22:53:23 +00009048 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009049 Py_DECREF(sub);
9050
Guido van Rossumd57fd912000-03-10 22:53:23 +00009051 return result;
9052}
9053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054Py_ssize_t
9055PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9056 Py_ssize_t start, Py_ssize_t end,
9057 int direction)
9058{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009060 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (PyUnicode_READY(str) == -1)
9062 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009063 if (start < 0 || end < 0) {
9064 PyErr_SetString(PyExc_IndexError, "string index out of range");
9065 return -2;
9066 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067 if (end > PyUnicode_GET_LENGTH(str))
9068 end = PyUnicode_GET_LENGTH(str);
9069 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009070 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9071 kind, end-start, ch, direction);
9072 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009073 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009074 else
9075 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076}
9077
Alexander Belopolsky40018472011-02-26 01:02:56 +00009078static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009079tailmatch(PyObject *self,
9080 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081 Py_ssize_t start,
9082 Py_ssize_t end,
9083 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 int kind_self;
9086 int kind_sub;
9087 void *data_self;
9088 void *data_sub;
9089 Py_ssize_t offset;
9090 Py_ssize_t i;
9091 Py_ssize_t end_sub;
9092
9093 if (PyUnicode_READY(self) == -1 ||
9094 PyUnicode_READY(substring) == -1)
9095 return 0;
9096
9097 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 return 1;
9099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9101 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009103 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 kind_self = PyUnicode_KIND(self);
9106 data_self = PyUnicode_DATA(self);
9107 kind_sub = PyUnicode_KIND(substring);
9108 data_sub = PyUnicode_DATA(substring);
9109 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9110
9111 if (direction > 0)
9112 offset = end;
9113 else
9114 offset = start;
9115
9116 if (PyUnicode_READ(kind_self, data_self, offset) ==
9117 PyUnicode_READ(kind_sub, data_sub, 0) &&
9118 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9119 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9120 /* If both are of the same kind, memcmp is sufficient */
9121 if (kind_self == kind_sub) {
9122 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009123 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009124 data_sub,
9125 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009126 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127 }
9128 /* otherwise we have to compare each character by first accesing it */
9129 else {
9130 /* We do not need to compare 0 and len(substring)-1 because
9131 the if statement above ensured already that they are equal
9132 when we end up here. */
Antoine Pitrou057119b2012-09-02 17:56:33 +02009133 /* TODO: honor direction and do a forward or backwards search */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 for (i = 1; i < end_sub; ++i) {
9135 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9136 PyUnicode_READ(kind_sub, data_sub, i))
9137 return 0;
9138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009141 }
9142
9143 return 0;
9144}
9145
Alexander Belopolsky40018472011-02-26 01:02:56 +00009146Py_ssize_t
9147PyUnicode_Tailmatch(PyObject *str,
9148 PyObject *substr,
9149 Py_ssize_t start,
9150 Py_ssize_t end,
9151 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009153 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009154
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 str = PyUnicode_FromObject(str);
9156 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009157 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 substr = PyUnicode_FromObject(substr);
9159 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 Py_DECREF(str);
9161 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009162 }
Tim Petersced69f82003-09-16 20:30:58 +00009163
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009164 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009165 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009166 Py_DECREF(str);
9167 Py_DECREF(substr);
9168 return result;
9169}
9170
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171/* Apply fixfct filter to the Unicode object self and return a
9172 reference to the modified object */
9173
Alexander Belopolsky40018472011-02-26 01:02:56 +00009174static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009175fixup(PyObject *self,
9176 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009177{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009178 PyObject *u;
9179 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009180 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009182 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009185 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 /* fix functions return the new maximum character in a string,
9188 if the kind of the resulting unicode object does not change,
9189 everything is fine. Otherwise we need to change the string kind
9190 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009191 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009192
9193 if (maxchar_new == 0) {
9194 /* no changes */;
9195 if (PyUnicode_CheckExact(self)) {
9196 Py_DECREF(u);
9197 Py_INCREF(self);
9198 return self;
9199 }
9200 else
9201 return u;
9202 }
9203
Victor Stinnere6abb482012-05-02 01:15:40 +02009204 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205
Victor Stinnereaab6042011-12-11 22:22:39 +01009206 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009208
9209 /* In case the maximum character changed, we need to
9210 convert the string to the new category. */
9211 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9212 if (v == NULL) {
9213 Py_DECREF(u);
9214 return NULL;
9215 }
9216 if (maxchar_new > maxchar_old) {
9217 /* If the maxchar increased so that the kind changed, not all
9218 characters are representable anymore and we need to fix the
9219 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009220 _PyUnicode_FastCopyCharacters(v, 0,
9221 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009222 maxchar_old = fixfct(v);
9223 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009224 }
9225 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009226 _PyUnicode_FastCopyCharacters(v, 0,
9227 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009228 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009229 Py_DECREF(u);
9230 assert(_PyUnicode_CheckConsistency(v, 1));
9231 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232}
9233
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009234static PyObject *
9235ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009237 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9238 char *resdata, *data = PyUnicode_DATA(self);
9239 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009240
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009241 res = PyUnicode_New(len, 127);
9242 if (res == NULL)
9243 return NULL;
9244 resdata = PyUnicode_DATA(res);
9245 if (lower)
9246 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009248 _Py_bytes_upper(resdata, data, len);
9249 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009250}
9251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009252static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009253handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009254{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009255 Py_ssize_t j;
9256 int final_sigma;
9257 Py_UCS4 c;
9258 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009259
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009260 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9261
9262 where ! is a negation and \p{xxx} is a character with property xxx.
9263 */
9264 for (j = i - 1; j >= 0; j--) {
9265 c = PyUnicode_READ(kind, data, j);
9266 if (!_PyUnicode_IsCaseIgnorable(c))
9267 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009268 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009269 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9270 if (final_sigma) {
9271 for (j = i + 1; j < length; j++) {
9272 c = PyUnicode_READ(kind, data, j);
9273 if (!_PyUnicode_IsCaseIgnorable(c))
9274 break;
9275 }
9276 final_sigma = j == length || !_PyUnicode_IsCased(c);
9277 }
9278 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279}
9280
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009281static int
9282lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9283 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009284{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009285 /* Obscure special case. */
9286 if (c == 0x3A3) {
9287 mapped[0] = handle_capital_sigma(kind, data, length, i);
9288 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009290 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291}
9292
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009293static Py_ssize_t
9294do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009296 Py_ssize_t i, k = 0;
9297 int n_res, j;
9298 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009299
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009300 c = PyUnicode_READ(kind, data, 0);
9301 n_res = _PyUnicode_ToUpperFull(c, mapped);
9302 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009303 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009304 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009306 for (i = 1; i < length; i++) {
9307 c = PyUnicode_READ(kind, data, i);
9308 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9309 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009310 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009311 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009312 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009313 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009314 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315}
9316
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009317static Py_ssize_t
9318do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9319 Py_ssize_t i, k = 0;
9320
9321 for (i = 0; i < length; i++) {
9322 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9323 int n_res, j;
9324 if (Py_UNICODE_ISUPPER(c)) {
9325 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9326 }
9327 else if (Py_UNICODE_ISLOWER(c)) {
9328 n_res = _PyUnicode_ToUpperFull(c, mapped);
9329 }
9330 else {
9331 n_res = 1;
9332 mapped[0] = c;
9333 }
9334 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009335 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009336 res[k++] = mapped[j];
9337 }
9338 }
9339 return k;
9340}
9341
9342static Py_ssize_t
9343do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9344 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009346 Py_ssize_t i, k = 0;
9347
9348 for (i = 0; i < length; i++) {
9349 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9350 int n_res, j;
9351 if (lower)
9352 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9353 else
9354 n_res = _PyUnicode_ToUpperFull(c, mapped);
9355 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009356 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009357 res[k++] = mapped[j];
9358 }
9359 }
9360 return k;
9361}
9362
9363static Py_ssize_t
9364do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9365{
9366 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9367}
9368
9369static Py_ssize_t
9370do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9371{
9372 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9373}
9374
Benjamin Petersone51757f2012-01-12 21:10:29 -05009375static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009376do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9377{
9378 Py_ssize_t i, k = 0;
9379
9380 for (i = 0; i < length; i++) {
9381 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9382 Py_UCS4 mapped[3];
9383 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9384 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009385 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009386 res[k++] = mapped[j];
9387 }
9388 }
9389 return k;
9390}
9391
9392static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009393do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9394{
9395 Py_ssize_t i, k = 0;
9396 int previous_is_cased;
9397
9398 previous_is_cased = 0;
9399 for (i = 0; i < length; i++) {
9400 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9401 Py_UCS4 mapped[3];
9402 int n_res, j;
9403
9404 if (previous_is_cased)
9405 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9406 else
9407 n_res = _PyUnicode_ToTitleFull(c, mapped);
9408
9409 for (j = 0; j < n_res; j++) {
Victor Stinnere6abb482012-05-02 01:15:40 +02009410 *maxchar = MAX_MAXCHAR(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009411 res[k++] = mapped[j];
9412 }
9413
9414 previous_is_cased = _PyUnicode_IsCased(c);
9415 }
9416 return k;
9417}
9418
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009419static PyObject *
9420case_operation(PyObject *self,
9421 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9422{
9423 PyObject *res = NULL;
9424 Py_ssize_t length, newlength = 0;
9425 int kind, outkind;
9426 void *data, *outdata;
9427 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9428
Benjamin Petersoneea48462012-01-16 14:28:50 -05009429 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009430
9431 kind = PyUnicode_KIND(self);
9432 data = PyUnicode_DATA(self);
9433 length = PyUnicode_GET_LENGTH(self);
9434 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9435 if (tmp == NULL)
9436 return PyErr_NoMemory();
9437 newlength = perform(kind, data, length, tmp, &maxchar);
9438 res = PyUnicode_New(newlength, maxchar);
9439 if (res == NULL)
9440 goto leave;
9441 tmpend = tmp + newlength;
9442 outdata = PyUnicode_DATA(res);
9443 outkind = PyUnicode_KIND(res);
9444 switch (outkind) {
9445 case PyUnicode_1BYTE_KIND:
9446 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9447 break;
9448 case PyUnicode_2BYTE_KIND:
9449 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9450 break;
9451 case PyUnicode_4BYTE_KIND:
9452 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9453 break;
9454 default:
9455 assert(0);
9456 break;
9457 }
9458 leave:
9459 PyMem_FREE(tmp);
9460 return res;
9461}
9462
Tim Peters8ce9f162004-08-27 01:49:32 +00009463PyObject *
9464PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009465{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009467 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009468 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009469 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009470 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9471 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009472 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009474 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009476 int use_memcpy;
9477 unsigned char *res_data = NULL, *sep_data = NULL;
9478 PyObject *last_obj;
9479 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480
Tim Peters05eba1f2004-08-27 21:32:02 +00009481 fseq = PySequence_Fast(seq, "");
9482 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009483 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009484 }
9485
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009486 /* NOTE: the following code can't call back into Python code,
9487 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009488 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009489
Tim Peters05eba1f2004-08-27 21:32:02 +00009490 seqlen = PySequence_Fast_GET_SIZE(fseq);
9491 /* If empty sequence, return u"". */
9492 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009493 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009494 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009495 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009496
Tim Peters05eba1f2004-08-27 21:32:02 +00009497 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009498 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009499 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009500 if (seqlen == 1) {
9501 if (PyUnicode_CheckExact(items[0])) {
9502 res = items[0];
9503 Py_INCREF(res);
9504 Py_DECREF(fseq);
9505 return res;
9506 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009507 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009508 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009509 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009510 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009511 /* Set up sep and seplen */
9512 if (separator == NULL) {
9513 /* fall back to a blank space separator */
9514 sep = PyUnicode_FromOrdinal(' ');
9515 if (!sep)
9516 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009517 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009518 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009519 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009520 else {
9521 if (!PyUnicode_Check(separator)) {
9522 PyErr_Format(PyExc_TypeError,
9523 "separator: expected str instance,"
9524 " %.80s found",
9525 Py_TYPE(separator)->tp_name);
9526 goto onError;
9527 }
9528 if (PyUnicode_READY(separator))
9529 goto onError;
9530 sep = separator;
9531 seplen = PyUnicode_GET_LENGTH(separator);
9532 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9533 /* inc refcount to keep this code path symmetric with the
9534 above case of a blank separator */
9535 Py_INCREF(sep);
9536 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009537 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009538 }
9539
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009540 /* There are at least two things to join, or else we have a subclass
9541 * of str in the sequence.
9542 * Do a pre-pass to figure out the total amount of space we'll
9543 * need (sz), and see whether all argument are strings.
9544 */
9545 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009546#ifdef Py_DEBUG
9547 use_memcpy = 0;
9548#else
9549 use_memcpy = 1;
9550#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009551 for (i = 0; i < seqlen; i++) {
9552 const Py_ssize_t old_sz = sz;
9553 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009554 if (!PyUnicode_Check(item)) {
9555 PyErr_Format(PyExc_TypeError,
9556 "sequence item %zd: expected str instance,"
9557 " %.80s found",
9558 i, Py_TYPE(item)->tp_name);
9559 goto onError;
9560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 if (PyUnicode_READY(item) == -1)
9562 goto onError;
9563 sz += PyUnicode_GET_LENGTH(item);
9564 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnere6abb482012-05-02 01:15:40 +02009565 maxchar = MAX_MAXCHAR(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009566 if (i != 0)
9567 sz += seplen;
9568 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9569 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009571 goto onError;
9572 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009573 if (use_memcpy && last_obj != NULL) {
9574 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9575 use_memcpy = 0;
9576 }
9577 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009578 }
Tim Petersced69f82003-09-16 20:30:58 +00009579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009580 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009581 if (res == NULL)
9582 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009583
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009584 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009585#ifdef Py_DEBUG
9586 use_memcpy = 0;
9587#else
9588 if (use_memcpy) {
9589 res_data = PyUnicode_1BYTE_DATA(res);
9590 kind = PyUnicode_KIND(res);
9591 if (seplen != 0)
9592 sep_data = PyUnicode_1BYTE_DATA(sep);
9593 }
9594#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009596 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009597 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009599 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009600 if (use_memcpy) {
9601 Py_MEMCPY(res_data,
9602 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009603 kind * seplen);
9604 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009605 }
9606 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009607 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009608 res_offset += seplen;
9609 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009610 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009611 itemlen = PyUnicode_GET_LENGTH(item);
9612 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009613 if (use_memcpy) {
9614 Py_MEMCPY(res_data,
9615 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009616 kind * itemlen);
9617 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 }
9619 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009620 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 res_offset += itemlen;
9622 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009623 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009624 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009625 if (use_memcpy)
9626 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009627 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009628 else
9629 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009630
Tim Peters05eba1f2004-08-27 21:32:02 +00009631 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009633 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009635
Benjamin Peterson29060642009-01-31 22:14:21 +00009636 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009637 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009639 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009640 return NULL;
9641}
9642
/* Store `value` into `length` consecutive code units of the buffer
   `data`, starting at index `start`; `kind` selects the width of a code
   unit (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND).  The arguments may be evaluated more than once,
   so only pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)(data) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)(data) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)(data) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9666
Victor Stinnerd3f08822012-05-29 12:57:52 +02009667void
9668_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9669 Py_UCS4 fill_char)
9670{
9671 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9672 const void *data = PyUnicode_DATA(unicode);
9673 assert(PyUnicode_IS_READY(unicode));
9674 assert(unicode_modifiable(unicode));
9675 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9676 assert(start >= 0);
9677 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9678 FILL(kind, data, fill_char, start, length);
9679}
9680
Victor Stinner3fe55312012-01-04 00:33:50 +01009681Py_ssize_t
9682PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9683 Py_UCS4 fill_char)
9684{
9685 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009686
9687 if (!PyUnicode_Check(unicode)) {
9688 PyErr_BadInternalCall();
9689 return -1;
9690 }
9691 if (PyUnicode_READY(unicode) == -1)
9692 return -1;
9693 if (unicode_check_modifiable(unicode))
9694 return -1;
9695
Victor Stinnerd3f08822012-05-29 12:57:52 +02009696 if (start < 0) {
9697 PyErr_SetString(PyExc_IndexError, "string index out of range");
9698 return -1;
9699 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009700 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9701 PyErr_SetString(PyExc_ValueError,
9702 "fill character is bigger than "
9703 "the string maximum character");
9704 return -1;
9705 }
9706
9707 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9708 length = Py_MIN(maxlen, length);
9709 if (length <= 0)
9710 return 0;
9711
Victor Stinnerd3f08822012-05-29 12:57:52 +02009712 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009713 return length;
9714}
9715
Victor Stinner9310abb2011-10-05 00:59:23 +02009716static PyObject *
9717pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009718 Py_ssize_t left,
9719 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009722 PyObject *u;
9723 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009724 int kind;
9725 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726
9727 if (left < 0)
9728 left = 0;
9729 if (right < 0)
9730 right = 0;
9731
Victor Stinnerc4b49542011-12-11 22:44:26 +01009732 if (left == 0 && right == 0)
9733 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9736 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009737 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9738 return NULL;
9739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02009741 maxchar = MAX_MAXCHAR(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009743 if (!u)
9744 return NULL;
9745
9746 kind = PyUnicode_KIND(u);
9747 data = PyUnicode_DATA(u);
9748 if (left)
9749 FILL(kind, data, fill, 0, left);
9750 if (right)
9751 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009752 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009753 assert(_PyUnicode_CheckConsistency(u, 1));
9754 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009755}
9756
Alexander Belopolsky40018472011-02-26 01:02:56 +00009757PyObject *
9758PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761
9762 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009763 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009765 if (PyUnicode_READY(string) == -1) {
9766 Py_DECREF(string);
9767 return NULL;
9768 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769
Benjamin Petersonead6b532011-12-20 17:23:42 -06009770 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 if (PyUnicode_IS_ASCII(string))
9773 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009774 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009775 PyUnicode_GET_LENGTH(string), keepends);
9776 else
9777 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009778 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009779 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 break;
9781 case PyUnicode_2BYTE_KIND:
9782 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009783 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 PyUnicode_GET_LENGTH(string), keepends);
9785 break;
9786 case PyUnicode_4BYTE_KIND:
9787 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789 PyUnicode_GET_LENGTH(string), keepends);
9790 break;
9791 default:
9792 assert(0);
9793 list = 0;
9794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795 Py_DECREF(string);
9796 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009797}
9798
Alexander Belopolsky40018472011-02-26 01:02:56 +00009799static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009800split(PyObject *self,
9801 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009802 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 int kind1, kind2, kind;
9805 void *buf1, *buf2;
9806 Py_ssize_t len1, len2;
9807 PyObject* out;
9808
Guido van Rossumd57fd912000-03-10 22:53:23 +00009809 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009810 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009812 if (PyUnicode_READY(self) == -1)
9813 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009816 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009818 if (PyUnicode_IS_ASCII(self))
9819 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009821 PyUnicode_GET_LENGTH(self), maxcount
9822 );
9823 else
9824 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009825 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009826 PyUnicode_GET_LENGTH(self), maxcount
9827 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 case PyUnicode_2BYTE_KIND:
9829 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009830 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 PyUnicode_GET_LENGTH(self), maxcount
9832 );
9833 case PyUnicode_4BYTE_KIND:
9834 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009835 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009836 PyUnicode_GET_LENGTH(self), maxcount
9837 );
9838 default:
9839 assert(0);
9840 return NULL;
9841 }
9842
9843 if (PyUnicode_READY(substring) == -1)
9844 return NULL;
9845
9846 kind1 = PyUnicode_KIND(self);
9847 kind2 = PyUnicode_KIND(substring);
9848 kind = kind1 > kind2 ? kind1 : kind2;
9849 buf1 = PyUnicode_DATA(self);
9850 buf2 = PyUnicode_DATA(substring);
9851 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009852 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 if (!buf1)
9854 return NULL;
9855 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009856 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (!buf2) {
9858 if (kind1 != kind) PyMem_Free(buf1);
9859 return NULL;
9860 }
9861 len1 = PyUnicode_GET_LENGTH(self);
9862 len2 = PyUnicode_GET_LENGTH(substring);
9863
Benjamin Petersonead6b532011-12-20 17:23:42 -06009864 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009866 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9867 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009868 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 else
9870 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009871 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 break;
9873 case PyUnicode_2BYTE_KIND:
9874 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009875 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_4BYTE_KIND:
9878 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009879 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 break;
9881 default:
9882 out = NULL;
9883 }
9884 if (kind1 != kind)
9885 PyMem_Free(buf1);
9886 if (kind2 != kind)
9887 PyMem_Free(buf2);
9888 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009889}
9890
Alexander Belopolsky40018472011-02-26 01:02:56 +00009891static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009892rsplit(PyObject *self,
9893 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009894 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 int kind1, kind2, kind;
9897 void *buf1, *buf2;
9898 Py_ssize_t len1, len2;
9899 PyObject* out;
9900
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009901 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009902 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 if (PyUnicode_READY(self) == -1)
9905 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -06009908 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009910 if (PyUnicode_IS_ASCII(self))
9911 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009913 PyUnicode_GET_LENGTH(self), maxcount
9914 );
9915 else
9916 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009917 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009920 case PyUnicode_2BYTE_KIND:
9921 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009922 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
9925 case PyUnicode_4BYTE_KIND:
9926 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009927 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 default:
9931 assert(0);
9932 return NULL;
9933 }
9934
9935 if (PyUnicode_READY(substring) == -1)
9936 return NULL;
9937
9938 kind1 = PyUnicode_KIND(self);
9939 kind2 = PyUnicode_KIND(substring);
9940 kind = kind1 > kind2 ? kind1 : kind2;
9941 buf1 = PyUnicode_DATA(self);
9942 buf2 = PyUnicode_DATA(substring);
9943 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009944 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 if (!buf1)
9946 return NULL;
9947 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009948 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 if (!buf2) {
9950 if (kind1 != kind) PyMem_Free(buf1);
9951 return NULL;
9952 }
9953 len1 = PyUnicode_GET_LENGTH(self);
9954 len2 = PyUnicode_GET_LENGTH(substring);
9955
Benjamin Petersonead6b532011-12-20 17:23:42 -06009956 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009958 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9959 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009960 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 else
9962 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009963 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 break;
9965 case PyUnicode_2BYTE_KIND:
9966 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009967 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_4BYTE_KIND:
9970 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009971 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 default:
9974 out = NULL;
9975 }
9976 if (kind1 != kind)
9977 PyMem_Free(buf1);
9978 if (kind2 != kind)
9979 PyMem_Free(buf2);
9980 return out;
9981}
9982
9983static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009984anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9985 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009986{
Benjamin Petersonead6b532011-12-20 17:23:42 -06009987 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009989 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9990 return asciilib_find(buf1, len1, buf2, len2, offset);
9991 else
9992 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 case PyUnicode_2BYTE_KIND:
9994 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9995 case PyUnicode_4BYTE_KIND:
9996 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9997 }
9998 assert(0);
9999 return -1;
10000}
10001
10002static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10004 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010006 switch (kind) {
10007 case PyUnicode_1BYTE_KIND:
10008 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10009 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10010 else
10011 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10012 case PyUnicode_2BYTE_KIND:
10013 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10014 case PyUnicode_4BYTE_KIND:
10015 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10016 }
10017 assert(0);
10018 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010019}
10020
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010022replace(PyObject *self, PyObject *str1,
10023 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 PyObject *u;
10026 char *sbuf = PyUnicode_DATA(self);
10027 char *buf1 = PyUnicode_DATA(str1);
10028 char *buf2 = PyUnicode_DATA(str2);
10029 int srelease = 0, release1 = 0, release2 = 0;
10030 int skind = PyUnicode_KIND(self);
10031 int kind1 = PyUnicode_KIND(str1);
10032 int kind2 = PyUnicode_KIND(str2);
10033 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10034 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10035 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010036 int mayshrink;
10037 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
10039 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010040 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010042 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043
Victor Stinner59de0ee2011-10-07 10:01:28 +020010044 if (str1 == str2)
10045 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 if (skind < kind1)
10047 /* substring too wide to be present */
10048 goto nothing;
10049
Victor Stinner49a0a212011-10-12 23:46:10 +020010050 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10051 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10052 /* Replacing str1 with str2 may cause a maxchar reduction in the
10053 result string. */
10054 mayshrink = (maxchar_str2 < maxchar);
Victor Stinnere6abb482012-05-02 01:15:40 +020010055 maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010058 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010060 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010063 Py_UCS4 u1, u2;
10064 int rkind;
Victor Stinnerf6441102011-12-18 02:43:08 +010010065 Py_ssize_t index, pos;
10066 char *src;
10067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068 u1 = PyUnicode_READ_CHAR(str1, 0);
Victor Stinnerf6441102011-12-18 02:43:08 +010010069 pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
10070 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010071 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010072 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010075 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010076 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 rkind = PyUnicode_KIND(u);
Victor Stinnerf6441102011-12-18 02:43:08 +010010078
10079 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), pos, u2);
10080 index = 0;
10081 src = sbuf;
10082 while (--maxcount)
10083 {
10084 pos++;
10085 src += pos * PyUnicode_KIND(self);
10086 slen -= pos;
10087 index += pos;
10088 pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
10089 if (pos < 0)
10090 break;
10091 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), index + pos, u2);
10092 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010093 }
10094 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 int rkind = skind;
10096 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010097 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010098
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 if (kind1 < rkind) {
10100 /* widen substring */
10101 buf1 = _PyUnicode_AsKind(str1, rkind);
10102 if (!buf1) goto error;
10103 release1 = 1;
10104 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010106 if (i < 0)
10107 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 if (rkind > kind2) {
10109 /* widen replacement */
10110 buf2 = _PyUnicode_AsKind(str2, rkind);
10111 if (!buf2) goto error;
10112 release2 = 1;
10113 }
10114 else if (rkind < kind2) {
10115 /* widen self and buf1 */
10116 rkind = kind2;
10117 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010118 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 sbuf = _PyUnicode_AsKind(self, rkind);
10120 if (!sbuf) goto error;
10121 srelease = 1;
10122 buf1 = _PyUnicode_AsKind(str1, rkind);
10123 if (!buf1) goto error;
10124 release1 = 1;
10125 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010126 u = PyUnicode_New(slen, maxchar);
10127 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 assert(PyUnicode_KIND(u) == rkind);
10130 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010131
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010132 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010133 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010134 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010135 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010136 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138
10139 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010140 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010141 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010143 if (i == -1)
10144 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010145 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010147 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010150 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010151 }
10152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010154 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 int rkind = skind;
10156 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010159 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 buf1 = _PyUnicode_AsKind(str1, rkind);
10161 if (!buf1) goto error;
10162 release1 = 1;
10163 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010164 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 if (n == 0)
10166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010168 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 buf2 = _PyUnicode_AsKind(str2, rkind);
10170 if (!buf2) goto error;
10171 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010174 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 rkind = kind2;
10176 sbuf = _PyUnicode_AsKind(self, rkind);
10177 if (!sbuf) goto error;
10178 srelease = 1;
10179 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010180 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 buf1 = _PyUnicode_AsKind(str1, rkind);
10182 if (!buf1) goto error;
10183 release1 = 1;
10184 }
10185 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10186 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010187 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 PyErr_SetString(PyExc_OverflowError,
10189 "replace string is too long");
10190 goto error;
10191 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010192 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010194 _Py_INCREF_UNICODE_EMPTY();
10195 if (!unicode_empty)
10196 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010197 u = unicode_empty;
10198 goto done;
10199 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010200 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 PyErr_SetString(PyExc_OverflowError,
10202 "replace string is too long");
10203 goto error;
10204 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010205 u = PyUnicode_New(new_size, maxchar);
10206 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010208 assert(PyUnicode_KIND(u) == rkind);
10209 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 ires = i = 0;
10211 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 while (n-- > 0) {
10213 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010217 if (j == -1)
10218 break;
10219 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010220 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 memcpy(res + rkind * ires,
10222 sbuf + rkind * i,
10223 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010225 }
10226 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010237 memcpy(res + rkind * ires,
10238 sbuf + rkind * i,
10239 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010240 }
10241 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 /* interleave */
10243 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010244 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010246 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 if (--n <= 0)
10249 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010250 memcpy(res + rkind * ires,
10251 sbuf + rkind * i,
10252 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 ires++;
10254 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010255 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010256 memcpy(res + rkind * ires,
10257 sbuf + rkind * i,
10258 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010259 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 }
10261
10262 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010263 unicode_adjust_maxchar(&u);
10264 if (u == NULL)
10265 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010267
10268 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 if (srelease)
10270 PyMem_FREE(sbuf);
10271 if (release1)
10272 PyMem_FREE(buf1);
10273 if (release2)
10274 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010275 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010276 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010277
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010279 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 if (srelease)
10281 PyMem_FREE(sbuf);
10282 if (release1)
10283 PyMem_FREE(buf1);
10284 if (release2)
10285 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010286 return unicode_result_unchanged(self);
10287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 error:
10289 if (srelease && sbuf)
10290 PyMem_FREE(sbuf);
10291 if (release1 && buf1)
10292 PyMem_FREE(buf1);
10293 if (release2 && buf2)
10294 PyMem_FREE(buf2);
10295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010296}
10297
10298/* --- Unicode Object Methods --------------------------------------------- */
10299
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010300PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010301 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010302\n\
10303Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010304characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010305
10306static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010307unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010308{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010309 if (PyUnicode_READY(self) == -1)
10310 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010311 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010312}
10313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010314PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010315 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316\n\
10317Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010318have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319
10320static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010321unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010323 if (PyUnicode_READY(self) == -1)
10324 return NULL;
10325 if (PyUnicode_GET_LENGTH(self) == 0)
10326 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010327 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328}
10329
Benjamin Petersond5890c82012-01-14 13:23:30 -050010330PyDoc_STRVAR(casefold__doc__,
10331 "S.casefold() -> str\n\
10332\n\
10333Return a version of S suitable for caseless comparisons.");
10334
10335static PyObject *
10336unicode_casefold(PyObject *self)
10337{
10338 if (PyUnicode_READY(self) == -1)
10339 return NULL;
10340 if (PyUnicode_IS_ASCII(self))
10341 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010342 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010343}
10344
10345
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010346/* Argument converter. Coerces to a single unicode character */
10347
10348static int
10349convert_uc(PyObject *obj, void *addr)
10350{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010353
Benjamin Peterson14339b62009-01-31 16:36:08 +000010354 uniobj = PyUnicode_FromObject(obj);
10355 if (uniobj == NULL) {
10356 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010357 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 return 0;
10359 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010361 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010362 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010363 Py_DECREF(uniobj);
10364 return 0;
10365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010367 Py_DECREF(uniobj);
10368 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010369}
10370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010371PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010373\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010374Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010375done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010376
10377static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010378unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010380 Py_ssize_t marg, left;
10381 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 Py_UCS4 fillchar = ' ';
10383
Victor Stinnere9a29352011-10-01 02:14:59 +020010384 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386
Benjamin Petersonbac79492012-01-14 13:34:47 -050010387 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010388 return NULL;
10389
Victor Stinnerc4b49542011-12-11 22:44:26 +010010390 if (PyUnicode_GET_LENGTH(self) >= width)
10391 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Victor Stinnerc4b49542011-12-11 22:44:26 +010010393 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 left = marg / 2 + (marg & width & 1);
10395
Victor Stinner9310abb2011-10-05 00:59:23 +020010396 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397}
10398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399/* This function assumes that str1 and str2 are readied by the caller. */
10400
Marc-André Lemburge5034372000-08-08 08:04:29 +000010401static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010402unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 int kind1, kind2;
10405 void *data1, *data2;
10406 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 kind1 = PyUnicode_KIND(str1);
10409 kind2 = PyUnicode_KIND(str2);
10410 data1 = PyUnicode_DATA(str1);
10411 data2 = PyUnicode_DATA(str2);
10412 len1 = PyUnicode_GET_LENGTH(str1);
10413 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 for (i = 0; i < len1 && i < len2; ++i) {
10416 Py_UCS4 c1, c2;
10417 c1 = PyUnicode_READ(kind1, data1, i);
10418 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010419
10420 if (c1 != c2)
10421 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010422 }
10423
10424 return (len1 < len2) ? -1 : (len1 != len2);
10425}
10426
Alexander Belopolsky40018472011-02-26 01:02:56 +000010427int
10428PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10431 if (PyUnicode_READY(left) == -1 ||
10432 PyUnicode_READY(right) == -1)
10433 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010434 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010436 PyErr_Format(PyExc_TypeError,
10437 "Can't compare %.100s and %.100s",
10438 left->ob_type->tp_name,
10439 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010440 return -1;
10441}
10442
Martin v. Löwis5b222132007-06-10 09:51:05 +000010443int
10444PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 Py_ssize_t i;
10447 int kind;
10448 void *data;
10449 Py_UCS4 chr;
10450
Victor Stinner910337b2011-10-03 03:20:16 +020010451 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 if (PyUnicode_READY(uni) == -1)
10453 return -1;
10454 kind = PyUnicode_KIND(uni);
10455 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010456 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10458 if (chr != str[i])
10459 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010460 /* This check keeps Python strings that end in '\0' from comparing equal
10461 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010464 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010465 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010466 return 0;
10467}
10468
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010469
Benjamin Peterson29060642009-01-31 22:14:21 +000010470#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010472
Alexander Belopolsky40018472011-02-26 01:02:56 +000010473PyObject *
10474PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010475{
10476 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010477
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010478 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10479 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 if (PyUnicode_READY(left) == -1 ||
10481 PyUnicode_READY(right) == -1)
10482 return NULL;
10483 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10484 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010485 if (op == Py_EQ) {
10486 Py_INCREF(Py_False);
10487 return Py_False;
10488 }
10489 if (op == Py_NE) {
10490 Py_INCREF(Py_True);
10491 return Py_True;
10492 }
10493 }
10494 if (left == right)
10495 result = 0;
10496 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010497 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010498
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010499 /* Convert the return value to a Boolean */
10500 switch (op) {
10501 case Py_EQ:
10502 v = TEST_COND(result == 0);
10503 break;
10504 case Py_NE:
10505 v = TEST_COND(result != 0);
10506 break;
10507 case Py_LE:
10508 v = TEST_COND(result <= 0);
10509 break;
10510 case Py_GE:
10511 v = TEST_COND(result >= 0);
10512 break;
10513 case Py_LT:
10514 v = TEST_COND(result == -1);
10515 break;
10516 case Py_GT:
10517 v = TEST_COND(result == 1);
10518 break;
10519 default:
10520 PyErr_BadArgument();
10521 return NULL;
10522 }
10523 Py_INCREF(v);
10524 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010525 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010526
Brian Curtindfc80e32011-08-10 20:28:54 -050010527 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010528}
10529
Alexander Belopolsky40018472011-02-26 01:02:56 +000010530int
10531PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010532{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 int kind1, kind2, kind;
10535 void *buf1, *buf2;
10536 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010537 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010538
10539 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010540 sub = PyUnicode_FromObject(element);
10541 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010542 PyErr_Format(PyExc_TypeError,
10543 "'in <string>' requires string as left operand, not %s",
10544 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010545 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010546 }
10547
Thomas Wouters477c8d52006-05-27 19:21:47 +000010548 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010549 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010550 Py_DECREF(sub);
10551 return -1;
10552 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060010553 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
10554 Py_DECREF(sub);
10555 Py_DECREF(str);
10556 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 kind1 = PyUnicode_KIND(str);
10559 kind2 = PyUnicode_KIND(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010560 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 buf1 = PyUnicode_DATA(str);
10562 buf2 = PyUnicode_DATA(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010563 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010564 if (kind2 > kind) {
10565 Py_DECREF(sub);
10566 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010567 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010568 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010010569 buf2 = _PyUnicode_AsKind(sub, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010570 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 if (!buf2) {
10572 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010573 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 return -1;
10575 }
10576 len1 = PyUnicode_GET_LENGTH(str);
10577 len2 = PyUnicode_GET_LENGTH(sub);
10578
Benjamin Petersonead6b532011-12-20 17:23:42 -060010579 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 case PyUnicode_1BYTE_KIND:
10581 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10582 break;
10583 case PyUnicode_2BYTE_KIND:
10584 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10585 break;
10586 case PyUnicode_4BYTE_KIND:
10587 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10588 break;
10589 default:
10590 result = -1;
10591 assert(0);
10592 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010593
10594 Py_DECREF(str);
10595 Py_DECREF(sub);
10596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (kind2 != kind)
10598 PyMem_Free(buf2);
10599
Guido van Rossum403d68b2000-03-13 15:55:09 +000010600 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010601}
10602
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603/* Concat to string or Unicode object giving a new Unicode object. */
10604
Alexander Belopolsky40018472011-02-26 01:02:56 +000010605PyObject *
10606PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010609 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010610 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010615 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619
10620 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010621 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010622 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010625 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 }
10629
Victor Stinner488fa492011-12-12 00:01:39 +010010630 u_len = PyUnicode_GET_LENGTH(u);
10631 v_len = PyUnicode_GET_LENGTH(v);
10632 if (u_len > PY_SSIZE_T_MAX - v_len) {
10633 PyErr_SetString(PyExc_OverflowError,
10634 "strings are too large to concat");
10635 goto onError;
10636 }
10637 new_len = u_len + v_len;
10638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010640 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Victor Stinnere6abb482012-05-02 01:15:40 +020010641 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642
Guido van Rossumd57fd912000-03-10 22:53:23 +000010643 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010010644 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010647 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10648 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649 Py_DECREF(u);
10650 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010651 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010653
Benjamin Peterson29060642009-01-31 22:14:21 +000010654 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655 Py_XDECREF(u);
10656 Py_XDECREF(v);
10657 return NULL;
10658}
10659
Walter Dörwald1ab83302007-05-18 17:15:44 +000010660void
Victor Stinner23e56682011-10-03 03:54:37 +020010661PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010662{
Victor Stinner23e56682011-10-03 03:54:37 +020010663 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010010664 Py_UCS4 maxchar, maxchar2;
10665 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020010666
10667 if (p_left == NULL) {
10668 if (!PyErr_Occurred())
10669 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010670 return;
10671 }
Victor Stinner23e56682011-10-03 03:54:37 +020010672 left = *p_left;
Serhiy Storchaka6c83e732013-01-04 12:39:34 +020010673 if (right == NULL || left == NULL || !PyUnicode_Check(left)) {
Victor Stinner23e56682011-10-03 03:54:37 +020010674 if (!PyErr_Occurred())
10675 PyErr_BadInternalCall();
10676 goto error;
10677 }
10678
Benjamin Petersonbac79492012-01-14 13:34:47 -050010679 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010680 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050010681 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020010682 goto error;
10683
Victor Stinner488fa492011-12-12 00:01:39 +010010684 /* Shortcuts */
10685 if (left == unicode_empty) {
10686 Py_DECREF(left);
10687 Py_INCREF(right);
10688 *p_left = right;
10689 return;
10690 }
10691 if (right == unicode_empty)
10692 return;
10693
10694 left_len = PyUnicode_GET_LENGTH(left);
10695 right_len = PyUnicode_GET_LENGTH(right);
10696 if (left_len > PY_SSIZE_T_MAX - right_len) {
10697 PyErr_SetString(PyExc_OverflowError,
10698 "strings are too large to concat");
10699 goto error;
10700 }
10701 new_len = left_len + right_len;
10702
10703 if (unicode_modifiable(left)
10704 && PyUnicode_CheckExact(right)
10705 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020010706 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10707 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010708 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010709 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010010710 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10711 {
10712 /* append inplace */
10713 if (unicode_resize(p_left, new_len) != 0) {
10714 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10715 * deallocated so it cannot be put back into
10716 * 'variable'. The MemoryError is raised when there
10717 * is no value in 'variable', which might (very
10718 * remotely) be a cause of incompatibilities.
10719 */
10720 goto error;
Victor Stinner23e56682011-10-03 03:54:37 +020010721 }
Victor Stinner488fa492011-12-12 00:01:39 +010010722 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerd3f08822012-05-29 12:57:52 +020010723 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020010724 }
Victor Stinner488fa492011-12-12 00:01:39 +010010725 else {
10726 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10727 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Victor Stinnere6abb482012-05-02 01:15:40 +020010728 maxchar = MAX_MAXCHAR(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020010729
Victor Stinner488fa492011-12-12 00:01:39 +010010730 /* Concat the two Unicode strings */
10731 res = PyUnicode_New(new_len, maxchar);
10732 if (res == NULL)
10733 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020010734 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10735 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010010736 Py_DECREF(left);
10737 *p_left = res;
10738 }
10739 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010740 return;
10741
10742error:
Victor Stinner488fa492011-12-12 00:01:39 +010010743 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010744}
10745
10746void
10747PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010749 PyUnicode_Append(pleft, right);
10750 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010751}
10752
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010753PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010756Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010757string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010758interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
10760static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010761unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010763 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010764 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010765 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 int kind1, kind2, kind;
10768 void *buf1, *buf2;
10769 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
Jesus Ceaac451502011-04-20 17:09:23 +020010771 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10772 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 kind1 = PyUnicode_KIND(self);
10776 kind2 = PyUnicode_KIND(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040010777 if (kind2 > kind1)
10778 return PyLong_FromLong(0);
10779 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780 buf1 = PyUnicode_DATA(self);
10781 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010782 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010783 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 if (!buf2) {
10785 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 return NULL;
10787 }
10788 len1 = PyUnicode_GET_LENGTH(self);
10789 len2 = PyUnicode_GET_LENGTH(substring);
10790
10791 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060010792 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 case PyUnicode_1BYTE_KIND:
10794 iresult = ucs1lib_count(
10795 ((Py_UCS1*)buf1) + start, end - start,
10796 buf2, len2, PY_SSIZE_T_MAX
10797 );
10798 break;
10799 case PyUnicode_2BYTE_KIND:
10800 iresult = ucs2lib_count(
10801 ((Py_UCS2*)buf1) + start, end - start,
10802 buf2, len2, PY_SSIZE_T_MAX
10803 );
10804 break;
10805 case PyUnicode_4BYTE_KIND:
10806 iresult = ucs4lib_count(
10807 ((Py_UCS4*)buf1) + start, end - start,
10808 buf2, len2, PY_SSIZE_T_MAX
10809 );
10810 break;
10811 default:
10812 assert(0); iresult = 0;
10813 }
10814
10815 result = PyLong_FromSsize_t(iresult);
10816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010817 if (kind2 != kind)
10818 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819
10820 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010821
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 return result;
10823}
10824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010825PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010826 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010828Encode S using the codec registered for encoding. Default encoding\n\
10829is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010830handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010831a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10832'xmlcharrefreplace' as well as any other name registered with\n\
10833codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010836unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010838 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 char *encoding = NULL;
10840 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010841
Benjamin Peterson308d6372009-09-18 21:42:35 +000010842 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10843 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010845 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010846}
10847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010848PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850\n\
10851Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010852If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
10854static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010855unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010857 Py_ssize_t i, j, line_pos, src_len, incr;
10858 Py_UCS4 ch;
10859 PyObject *u;
10860 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010863 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864
10865 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867
Antoine Pitrou22425222011-10-04 19:10:51 +020010868 if (PyUnicode_READY(self) == -1)
10869 return NULL;
10870
Thomas Wouters7e474022000-07-16 12:04:32 +000010871 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 src_len = PyUnicode_GET_LENGTH(self);
10873 i = j = line_pos = 0;
10874 kind = PyUnicode_KIND(self);
10875 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010877 for (; i < src_len; i++) {
10878 ch = PyUnicode_READ(kind, src_data, i);
10879 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010880 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010881 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010883 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010884 goto overflow;
10885 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010886 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010889 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010890 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010891 goto overflow;
10892 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010894 if (ch == '\n' || ch == '\r')
10895 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010897 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010010898 if (!found)
10899 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010900
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 if (!u)
10904 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906
Antoine Pitroue71d5742011-10-04 15:55:09 +020010907 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908
Antoine Pitroue71d5742011-10-04 15:55:09 +020010909 for (; i < src_len; i++) {
10910 ch = PyUnicode_READ(kind, src_data, i);
10911 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010912 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010913 incr = tabsize - (line_pos % tabsize);
10914 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010010915 FILL(kind, dest_data, ' ', j, incr);
10916 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010917 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010918 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010919 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010920 line_pos++;
10921 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010922 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010923 if (ch == '\n' || ch == '\r')
10924 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010926 }
10927 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010928 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010929
Antoine Pitroue71d5742011-10-04 15:55:09 +020010930 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010931 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10932 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933}
10934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937\n\
10938Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010939such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940arguments start and end are interpreted as in slice notation.\n\
10941\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010942Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
10944static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010945unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010947 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010948 Py_ssize_t start;
10949 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010950 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951
Jesus Ceaac451502011-04-20 17:09:23 +020010952 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10953 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (PyUnicode_READY(self) == -1)
10957 return NULL;
10958 if (PyUnicode_READY(substring) == -1)
10959 return NULL;
10960
Victor Stinner7931d9a2011-11-04 00:22:48 +010010961 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010965 if (result == -2)
10966 return NULL;
10967
Christian Heimes217cfd12007-12-02 14:31:20 +000010968 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010969}
10970
10971static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010972unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010974 void *data;
10975 enum PyUnicode_Kind kind;
10976 Py_UCS4 ch;
10977 PyObject *res;
10978
10979 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
10980 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020010982 }
10983 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
10984 PyErr_SetString(PyExc_IndexError, "string index out of range");
10985 return NULL;
10986 }
10987 kind = PyUnicode_KIND(self);
10988 data = PyUnicode_DATA(self);
10989 ch = PyUnicode_READ(kind, data, index);
10990 if (ch < 256)
10991 return get_latin1_char(ch);
10992
10993 res = PyUnicode_New(1, ch);
10994 if (res == NULL)
10995 return NULL;
10996 kind = PyUnicode_KIND(res);
10997 data = PyUnicode_DATA(res);
10998 PyUnicode_WRITE(kind, data, 0, ch);
10999 assert(_PyUnicode_CheckConsistency(res, 1));
11000 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001}
11002
Guido van Rossumc2504932007-09-18 19:42:40 +000011003/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011004 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011005static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011006unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007{
Guido van Rossumc2504932007-09-18 19:42:40 +000011008 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011009 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011010
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011011#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011012 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011013#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 if (_PyUnicode_HASH(self) != -1)
11015 return _PyUnicode_HASH(self);
11016 if (PyUnicode_READY(self) == -1)
11017 return -1;
11018 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011019 /*
11020 We make the hash of the empty string be 0, rather than using
11021 (prefix ^ suffix), since this slightly obfuscates the hash secret
11022 */
11023 if (len == 0) {
11024 _PyUnicode_HASH(self) = 0;
11025 return 0;
11026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027
11028 /* The hash function as a macro, gets expanded three times below. */
Georg Brandl2fb477c2012-02-21 00:33:36 +010011029#define HASH(P) \
11030 x ^= (Py_uhash_t) *P << 7; \
11031 while (--len >= 0) \
11032 x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033
Georg Brandl2fb477c2012-02-21 00:33:36 +010011034 x = (Py_uhash_t) _Py_HashSecret.prefix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 switch (PyUnicode_KIND(self)) {
11036 case PyUnicode_1BYTE_KIND: {
11037 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11038 HASH(c);
11039 break;
11040 }
11041 case PyUnicode_2BYTE_KIND: {
11042 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11043 HASH(s);
11044 break;
11045 }
11046 default: {
11047 Py_UCS4 *l;
11048 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11049 "Impossible switch case in unicode_hash");
11050 l = PyUnicode_4BYTE_DATA(self);
11051 HASH(l);
11052 break;
11053 }
11054 }
Georg Brandl2fb477c2012-02-21 00:33:36 +010011055 x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11056 x ^= (Py_uhash_t) _Py_HashSecret.suffix;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057
Guido van Rossumc2504932007-09-18 19:42:40 +000011058 if (x == -1)
11059 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011061 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011065PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011066 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011068Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069
11070static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011073 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011074 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011075 Py_ssize_t start;
11076 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Jesus Ceaac451502011-04-20 17:09:23 +020011078 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11079 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (PyUnicode_READY(self) == -1)
11083 return NULL;
11084 if (PyUnicode_READY(substring) == -1)
11085 return NULL;
11086
Victor Stinner7931d9a2011-11-04 00:22:48 +010011087 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (result == -2)
11092 return NULL;
11093
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094 if (result < 0) {
11095 PyErr_SetString(PyExc_ValueError, "substring not found");
11096 return NULL;
11097 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098
Christian Heimes217cfd12007-12-02 14:31:20 +000011099 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100}
11101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011102PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011103 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011105Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011106at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107
11108static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011109unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 Py_ssize_t i, length;
11112 int kind;
11113 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011114 int cased;
11115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 if (PyUnicode_READY(self) == -1)
11117 return NULL;
11118 length = PyUnicode_GET_LENGTH(self);
11119 kind = PyUnicode_KIND(self);
11120 data = PyUnicode_DATA(self);
11121
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 if (length == 1)
11124 return PyBool_FromLong(
11125 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011127 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011128 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011129 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011130
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011132 for (i = 0; i < length; i++) {
11133 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011134
Benjamin Peterson29060642009-01-31 22:14:21 +000011135 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11136 return PyBool_FromLong(0);
11137 else if (!cased && Py_UNICODE_ISLOWER(ch))
11138 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011140 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011141}
11142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011143PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011144 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011146Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011147at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148
11149static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011150unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 Py_ssize_t i, length;
11153 int kind;
11154 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155 int cased;
11156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 if (PyUnicode_READY(self) == -1)
11158 return NULL;
11159 length = PyUnicode_GET_LENGTH(self);
11160 kind = PyUnicode_KIND(self);
11161 data = PyUnicode_DATA(self);
11162
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 if (length == 1)
11165 return PyBool_FromLong(
11166 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011168 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011170 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011171
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011173 for (i = 0; i < length; i++) {
11174 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011175
Benjamin Peterson29060642009-01-31 22:14:21 +000011176 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11177 return PyBool_FromLong(0);
11178 else if (!cased && Py_UNICODE_ISUPPER(ch))
11179 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011181 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182}
11183
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011184PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011185 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011187Return True if S is a titlecased string and there is at least one\n\
11188character in S, i.e. upper- and titlecase characters may only\n\
11189follow uncased characters and lowercase characters only cased ones.\n\
11190Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011193unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 Py_ssize_t i, length;
11196 int kind;
11197 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198 int cased, previous_is_cased;
11199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 if (PyUnicode_READY(self) == -1)
11201 return NULL;
11202 length = PyUnicode_GET_LENGTH(self);
11203 kind = PyUnicode_KIND(self);
11204 data = PyUnicode_DATA(self);
11205
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207 if (length == 1) {
11208 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11209 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11210 (Py_UNICODE_ISUPPER(ch) != 0));
11211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011213 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011215 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011216
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 cased = 0;
11218 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 for (i = 0; i < length; i++) {
11220 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011221
Benjamin Peterson29060642009-01-31 22:14:21 +000011222 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11223 if (previous_is_cased)
11224 return PyBool_FromLong(0);
11225 previous_is_cased = 1;
11226 cased = 1;
11227 }
11228 else if (Py_UNICODE_ISLOWER(ch)) {
11229 if (!previous_is_cased)
11230 return PyBool_FromLong(0);
11231 previous_is_cased = 1;
11232 cased = 1;
11233 }
11234 else
11235 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011237 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011238}
11239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011240PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011241 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011242\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011243Return True if all characters in S are whitespace\n\
11244and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011245
11246static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011247unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 Py_ssize_t i, length;
11250 int kind;
11251 void *data;
11252
11253 if (PyUnicode_READY(self) == -1)
11254 return NULL;
11255 length = PyUnicode_GET_LENGTH(self);
11256 kind = PyUnicode_KIND(self);
11257 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011258
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 if (length == 1)
11261 return PyBool_FromLong(
11262 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011264 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268 for (i = 0; i < length; i++) {
11269 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011270 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011273 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274}
11275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011279Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011280and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011281
11282static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011283unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011285 Py_ssize_t i, length;
11286 int kind;
11287 void *data;
11288
11289 if (PyUnicode_READY(self) == -1)
11290 return NULL;
11291 length = PyUnicode_GET_LENGTH(self);
11292 kind = PyUnicode_KIND(self);
11293 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011294
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011295 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if (length == 1)
11297 return PyBool_FromLong(
11298 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011299
11300 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011301 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011302 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011303
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 for (i = 0; i < length; i++) {
11305 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011307 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011308 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011309}
11310
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011311PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011312 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011313\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011314Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011315and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011316
11317static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011318unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 int kind;
11321 void *data;
11322 Py_ssize_t len, i;
11323
11324 if (PyUnicode_READY(self) == -1)
11325 return NULL;
11326
11327 kind = PyUnicode_KIND(self);
11328 data = PyUnicode_DATA(self);
11329 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011330
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011331 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011332 if (len == 1) {
11333 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11334 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11335 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011336
11337 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011339 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 for (i = 0; i < len; i++) {
11342 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011343 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011344 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011346 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011347}
11348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011349PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011350 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011352Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011353False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
11355static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011356unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011358 Py_ssize_t i, length;
11359 int kind;
11360 void *data;
11361
11362 if (PyUnicode_READY(self) == -1)
11363 return NULL;
11364 length = PyUnicode_GET_LENGTH(self);
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (length == 1)
11370 return PyBool_FromLong(
11371 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 for (i = 0; i < length; i++) {
11378 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011379 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011381 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011382}
11383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011384PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011385 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011386\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011387Return True if all characters in S are digits\n\
11388and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389
11390static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011391unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011392{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 Py_ssize_t i, length;
11394 int kind;
11395 void *data;
11396
11397 if (PyUnicode_READY(self) == -1)
11398 return NULL;
11399 length = PyUnicode_GET_LENGTH(self);
11400 kind = PyUnicode_KIND(self);
11401 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 if (length == 1) {
11405 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11406 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011409 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413 for (i = 0; i < length; i++) {
11414 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011415 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011417 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418}
11419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011420PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011421 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011423Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011427unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 Py_ssize_t i, length;
11430 int kind;
11431 void *data;
11432
11433 if (PyUnicode_READY(self) == -1)
11434 return NULL;
11435 length = PyUnicode_GET_LENGTH(self);
11436 kind = PyUnicode_KIND(self);
11437 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011440 if (length == 1)
11441 return PyBool_FromLong(
11442 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011444 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011446 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 for (i = 0; i < length; i++) {
11449 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011450 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011452 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453}
11454
Martin v. Löwis47383402007-08-15 07:32:56 +000011455int
11456PyUnicode_IsIdentifier(PyObject *self)
11457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 int kind;
11459 void *data;
11460 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011461 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (PyUnicode_READY(self) == -1) {
11464 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011465 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 }
11467
11468 /* Special case for empty strings */
11469 if (PyUnicode_GET_LENGTH(self) == 0)
11470 return 0;
11471 kind = PyUnicode_KIND(self);
11472 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011473
11474 /* PEP 3131 says that the first character must be in
11475 XID_Start and subsequent characters in XID_Continue,
11476 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011477 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011478 letters, digits, underscore). However, given the current
11479 definition of XID_Start and XID_Continue, it is sufficient
11480 to check just for these, except that _ must be allowed
11481 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011483 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011484 return 0;
11485
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011486 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011488 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011489 return 1;
11490}
11491
11492PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011493 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011494\n\
11495Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011496to the language definition.\n\
11497\n\
11498Use keyword.iskeyword() to test for reserved identifiers\n\
11499such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011500
11501static PyObject*
11502unicode_isidentifier(PyObject *self)
11503{
11504 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11505}
11506
Georg Brandl559e5d72008-06-11 18:37:52 +000011507PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011509\n\
11510Return True if all characters in S are considered\n\
11511printable in repr() or S is empty, False otherwise.");
11512
11513static PyObject*
11514unicode_isprintable(PyObject *self)
11515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011516 Py_ssize_t i, length;
11517 int kind;
11518 void *data;
11519
11520 if (PyUnicode_READY(self) == -1)
11521 return NULL;
11522 length = PyUnicode_GET_LENGTH(self);
11523 kind = PyUnicode_KIND(self);
11524 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011525
11526 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011527 if (length == 1)
11528 return PyBool_FromLong(
11529 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 for (i = 0; i < length; i++) {
11532 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011533 Py_RETURN_FALSE;
11534 }
11535 }
11536 Py_RETURN_TRUE;
11537}
11538
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011539PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011540 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541\n\
11542Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011543iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
11545static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011546unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011548 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549}
11550
Martin v. Löwis18e16552006-02-15 17:27:45 +000011551static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011552unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 if (PyUnicode_READY(self) == -1)
11555 return -1;
11556 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557}
11558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011559PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011561\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011562Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011563done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564
11565static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011566unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011568 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 Py_UCS4 fillchar = ' ';
11570
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011571 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 return NULL;
11573
Benjamin Petersonbac79492012-01-14 13:34:47 -050011574 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011575 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011576
Victor Stinnerc4b49542011-12-11 22:44:26 +010011577 if (PyUnicode_GET_LENGTH(self) >= width)
11578 return unicode_result_unchanged(self);
11579
11580 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581}
11582
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011583PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011586Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587
11588static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011589unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011591 if (PyUnicode_READY(self) == -1)
11592 return NULL;
11593 if (PyUnicode_IS_ASCII(self))
11594 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011595 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596}
11597
/* Selectors telling the strip helpers which end(s) of a string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string with the leading three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
11606
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011607/* externally visible for str.strip(unicode) */
11608PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011609_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 void *data;
11612 int kind;
11613 Py_ssize_t i, j, len;
11614 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11617 return NULL;
11618
11619 kind = PyUnicode_KIND(self);
11620 data = PyUnicode_DATA(self);
11621 len = PyUnicode_GET_LENGTH(self);
11622 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11623 PyUnicode_DATA(sepobj),
11624 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011625
Benjamin Peterson14339b62009-01-31 16:36:08 +000011626 i = 0;
11627 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 while (i < len &&
11629 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 i++;
11631 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011632 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011633
Benjamin Peterson14339b62009-01-31 16:36:08 +000011634 j = len;
11635 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011636 do {
11637 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 } while (j >= i &&
11639 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011641 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011642
Victor Stinner7931d9a2011-11-04 00:22:48 +010011643 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011644}
11645
11646PyObject*
11647PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11648{
11649 unsigned char *data;
11650 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011651 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011652
Victor Stinnerde636f32011-10-01 03:55:54 +020011653 if (PyUnicode_READY(self) == -1)
11654 return NULL;
11655
Victor Stinner684d5fd2012-05-03 02:32:34 +020011656 length = PyUnicode_GET_LENGTH(self);
11657 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020011658
Victor Stinner684d5fd2012-05-03 02:32:34 +020011659 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011660 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661
Victor Stinnerde636f32011-10-01 03:55:54 +020011662 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011663 PyErr_SetString(PyExc_IndexError, "string index out of range");
11664 return NULL;
11665 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020011666 if (start >= length || end < start)
11667 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020011668
Victor Stinner684d5fd2012-05-03 02:32:34 +020011669 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020011670 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020011671 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020011672 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020011673 }
11674 else {
11675 kind = PyUnicode_KIND(self);
11676 data = PyUnicode_1BYTE_DATA(self);
11677 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011678 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011679 length);
11680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682
11683static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011684do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 int kind;
11687 void *data;
11688 Py_ssize_t len, i, j;
11689
11690 if (PyUnicode_READY(self) == -1)
11691 return NULL;
11692
11693 kind = PyUnicode_KIND(self);
11694 data = PyUnicode_DATA(self);
11695 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011696
Benjamin Peterson14339b62009-01-31 16:36:08 +000011697 i = 0;
11698 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011700 i++;
11701 }
11702 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011703
Benjamin Peterson14339b62009-01-31 16:36:08 +000011704 j = len;
11705 if (striptype != LEFTSTRIP) {
11706 do {
11707 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 j++;
11710 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011711
Victor Stinner7931d9a2011-11-04 00:22:48 +010011712 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713}
11714
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011715
11716static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011717do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011719 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011720
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11722 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 if (sep != NULL && sep != Py_None) {
11725 if (PyUnicode_Check(sep))
11726 return _PyUnicode_XStrip(self, striptype, sep);
11727 else {
11728 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011729 "%s arg must be None or str",
11730 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011731 return NULL;
11732 }
11733 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734
Benjamin Peterson14339b62009-01-31 16:36:08 +000011735 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011736}
11737
11738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011739PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011740 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011741\n\
11742Return a copy of the string S with leading and trailing\n\
11743whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011744If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011745
11746static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011747unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011748{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011749 if (PyTuple_GET_SIZE(args) == 0)
11750 return do_strip(self, BOTHSTRIP); /* Common case */
11751 else
11752 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011753}
11754
11755
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011756PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011758\n\
11759Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011760If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011761
11762static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011763unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011764{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011765 if (PyTuple_GET_SIZE(args) == 0)
11766 return do_strip(self, LEFTSTRIP); /* Common case */
11767 else
11768 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011769}
11770
11771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011772PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011773 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011774\n\
11775Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011776If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011777
11778static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011779unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011780{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781 if (PyTuple_GET_SIZE(args) == 0)
11782 return do_strip(self, RIGHTSTRIP); /* Common case */
11783 else
11784 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011785}
11786
11787
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011789unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011791 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011792 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793
Serhiy Storchaka05997252013-01-26 12:14:02 +020011794 if (len < 1)
11795 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796
Victor Stinnerc4b49542011-12-11 22:44:26 +010011797 /* no repeat, return original string */
11798 if (len == 1)
11799 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000011800
Benjamin Petersonbac79492012-01-14 13:34:47 -050011801 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 return NULL;
11803
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011804 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011805 PyErr_SetString(PyExc_OverflowError,
11806 "repeated string is too long");
11807 return NULL;
11808 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011810
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011811 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 if (!u)
11813 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011814 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 if (PyUnicode_GET_LENGTH(str) == 1) {
11817 const int kind = PyUnicode_KIND(str);
11818 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010011819 if (kind == PyUnicode_1BYTE_KIND) {
11820 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011821 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010011822 }
11823 else if (kind == PyUnicode_2BYTE_KIND) {
11824 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011825 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010011826 ucs2[n] = fill_char;
11827 } else {
11828 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11829 assert(kind == PyUnicode_4BYTE_KIND);
11830 for (n = 0; n < len; ++n)
11831 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011832 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 }
11834 else {
11835 /* number of characters copied this far */
11836 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011837 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 char *to = (char *) PyUnicode_DATA(u);
11839 Py_MEMCPY(to, PyUnicode_DATA(str),
11840 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011841 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 n = (done <= nchars-done) ? done : nchars-done;
11843 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011844 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 }
11847
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011848 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011849 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850}
11851
Alexander Belopolsky40018472011-02-26 01:02:56 +000011852PyObject *
11853PyUnicode_Replace(PyObject *obj,
11854 PyObject *subobj,
11855 PyObject *replobj,
11856 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857{
11858 PyObject *self;
11859 PyObject *str1;
11860 PyObject *str2;
11861 PyObject *result;
11862
11863 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011864 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011866 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011867 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 Py_DECREF(self);
11869 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870 }
11871 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011872 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011873 Py_DECREF(self);
11874 Py_DECREF(str1);
11875 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011877 if (PyUnicode_READY(self) == -1 ||
11878 PyUnicode_READY(str1) == -1 ||
11879 PyUnicode_READY(str2) == -1)
11880 result = NULL;
11881 else
11882 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 Py_DECREF(self);
11884 Py_DECREF(str1);
11885 Py_DECREF(str2);
11886 return result;
11887}
11888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011889PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011890 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891\n\
11892Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011893old replaced by new. If the optional argument count is\n\
11894given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
11896static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 PyObject *str1;
11900 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011901 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 PyObject *result;
11903
Martin v. Löwis18e16552006-02-15 17:27:45 +000011904 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060011906 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011907 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011909 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 return NULL;
11911 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060011912 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011913 Py_DECREF(str1);
11914 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011915 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060011916 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
11917 result = NULL;
11918 else
11919 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
11921 Py_DECREF(str1);
11922 Py_DECREF(str2);
11923 return result;
11924}
11925
Alexander Belopolsky40018472011-02-26 01:02:56 +000011926static PyObject *
11927unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011929 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 Py_ssize_t isize;
11931 Py_ssize_t osize, squote, dquote, i, o;
11932 Py_UCS4 max, quote;
11933 int ikind, okind;
11934 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011937 return NULL;
11938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 isize = PyUnicode_GET_LENGTH(unicode);
11940 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 /* Compute length of output, quote characters, and
11943 maximum character */
11944 osize = 2; /* quotes */
11945 max = 127;
11946 squote = dquote = 0;
11947 ikind = PyUnicode_KIND(unicode);
11948 for (i = 0; i < isize; i++) {
11949 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11950 switch (ch) {
11951 case '\'': squote++; osize++; break;
11952 case '"': dquote++; osize++; break;
11953 case '\\': case '\t': case '\r': case '\n':
11954 osize += 2; break;
11955 default:
11956 /* Fast-path ASCII */
11957 if (ch < ' ' || ch == 0x7f)
11958 osize += 4; /* \xHH */
11959 else if (ch < 0x7f)
11960 osize++;
11961 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11962 osize++;
11963 max = ch > max ? ch : max;
11964 }
11965 else if (ch < 0x100)
11966 osize += 4; /* \xHH */
11967 else if (ch < 0x10000)
11968 osize += 6; /* \uHHHH */
11969 else
11970 osize += 10; /* \uHHHHHHHH */
11971 }
11972 }
11973
11974 quote = '\'';
11975 if (squote) {
11976 if (dquote)
11977 /* Both squote and dquote present. Use squote,
11978 and escape them */
11979 osize += squote;
11980 else
11981 quote = '"';
11982 }
11983
11984 repr = PyUnicode_New(osize, max);
11985 if (repr == NULL)
11986 return NULL;
11987 okind = PyUnicode_KIND(repr);
11988 odata = PyUnicode_DATA(repr);
11989
11990 PyUnicode_WRITE(okind, odata, 0, quote);
11991 PyUnicode_WRITE(okind, odata, osize-1, quote);
11992
11993 for (i = 0, o = 1; i < isize; i++) {
11994 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011995
11996 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if ((ch == quote) || (ch == '\\')) {
11998 PyUnicode_WRITE(okind, odata, o++, '\\');
11999 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012000 continue;
12001 }
12002
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012004 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 PyUnicode_WRITE(okind, odata, o++, '\\');
12006 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012007 }
12008 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 PyUnicode_WRITE(okind, odata, o++, '\\');
12010 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012011 }
12012 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 PyUnicode_WRITE(okind, odata, o++, '\\');
12014 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012015 }
12016
12017 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012018 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 PyUnicode_WRITE(okind, odata, o++, '\\');
12020 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012021 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12022 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012023 }
12024
Georg Brandl559e5d72008-06-11 18:37:52 +000012025 /* Copy ASCII characters as-is */
12026 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012028 }
12029
Benjamin Peterson29060642009-01-31 22:14:21 +000012030 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012031 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012033 (categories Z* and C* except ASCII space)
12034 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012036 PyUnicode_WRITE(okind, odata, o++, '\\');
Georg Brandl559e5d72008-06-11 18:37:52 +000012037 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 if (ch <= 0xff) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012040 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12041 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012042 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012043 /* Map 16-bit characters to '\uxxxx' */
12044 else if (ch <= 0xffff) {
12045 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012046 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12047 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12048 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12049 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012050 }
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012051 /* Map 21-bit characters to '\U00xxxxxx' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012052 else {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012053 PyUnicode_WRITE(okind, odata, o++, 'U');
12054 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12055 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12056 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12057 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
Victor Stinnerf5cff562011-10-14 02:13:11 +020012058 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12059 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12060 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12061 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012062 }
12063 }
12064 /* Copy characters as-is */
12065 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012067 }
12068 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012071 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012072 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073}
12074
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012075PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012076 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077\n\
12078Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012079such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080arguments start and end are interpreted as in slice notation.\n\
12081\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012082Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
12084static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012087 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012088 Py_ssize_t start;
12089 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012090 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
Jesus Ceaac451502011-04-20 17:09:23 +020012092 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12093 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012094 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 if (PyUnicode_READY(self) == -1)
12097 return NULL;
12098 if (PyUnicode_READY(substring) == -1)
12099 return NULL;
12100
Victor Stinner7931d9a2011-11-04 00:22:48 +010012101 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102
12103 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012105 if (result == -2)
12106 return NULL;
12107
Christian Heimes217cfd12007-12-02 14:31:20 +000012108 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109}
12110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012111PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012114Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
12116static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012119 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012120 Py_ssize_t start;
12121 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012122 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012123
Jesus Ceaac451502011-04-20 17:09:23 +020012124 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12125 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012126 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (PyUnicode_READY(self) == -1)
12129 return NULL;
12130 if (PyUnicode_READY(substring) == -1)
12131 return NULL;
12132
Victor Stinner7931d9a2011-11-04 00:22:48 +010012133 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
12135 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 if (result == -2)
12138 return NULL;
12139
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140 if (result < 0) {
12141 PyErr_SetString(PyExc_ValueError, "substring not found");
12142 return NULL;
12143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012144
Christian Heimes217cfd12007-12-02 14:31:20 +000012145 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146}
12147
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012148PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012149 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012151Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012152done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
12154static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012155unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012157 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 Py_UCS4 fillchar = ' ';
12159
Victor Stinnere9a29352011-10-01 02:14:59 +020012160 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012162
Benjamin Petersonbac79492012-01-14 13:34:47 -050012163 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164 return NULL;
12165
Victor Stinnerc4b49542011-12-11 22:44:26 +010012166 if (PyUnicode_GET_LENGTH(self) >= width)
12167 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168
Victor Stinnerc4b49542011-12-11 22:44:26 +010012169 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170}
12171
Alexander Belopolsky40018472011-02-26 01:02:56 +000012172PyObject *
12173PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174{
12175 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012176
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 s = PyUnicode_FromObject(s);
12178 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012179 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012180 if (sep != NULL) {
12181 sep = PyUnicode_FromObject(sep);
12182 if (sep == NULL) {
12183 Py_DECREF(s);
12184 return NULL;
12185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186 }
12187
Victor Stinner9310abb2011-10-05 00:59:23 +020012188 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
12190 Py_DECREF(s);
12191 Py_XDECREF(sep);
12192 return result;
12193}
12194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012195PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012196 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197\n\
12198Return a list of the words in S, using sep as the\n\
12199delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012200splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012201whitespace string is a separator and empty strings are\n\
12202removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
12204static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012205unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012207 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012209 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012210
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012211 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12212 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213 return NULL;
12214
12215 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012216 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012218 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012220 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221}
12222
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223PyObject *
12224PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12225{
12226 PyObject* str_obj;
12227 PyObject* sep_obj;
12228 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 int kind1, kind2, kind;
12230 void *buf1 = NULL, *buf2 = NULL;
12231 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012232
12233 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012234 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012236 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012237 if (!sep_obj) {
12238 Py_DECREF(str_obj);
12239 return NULL;
12240 }
12241 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12242 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012243 Py_DECREF(str_obj);
12244 return NULL;
12245 }
12246
Victor Stinner14f8f022011-10-05 20:58:25 +020012247 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012249 kind = Py_MAX(kind1, kind2);
12250 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012251 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012252 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 if (!buf1)
12254 goto onError;
12255 buf2 = PyUnicode_DATA(sep_obj);
12256 if (kind2 != kind)
12257 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12258 if (!buf2)
12259 goto onError;
12260 len1 = PyUnicode_GET_LENGTH(str_obj);
12261 len2 = PyUnicode_GET_LENGTH(sep_obj);
12262
Benjamin Petersonead6b532011-12-20 17:23:42 -060012263 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012264 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012265 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12266 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12267 else
12268 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012269 break;
12270 case PyUnicode_2BYTE_KIND:
12271 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12272 break;
12273 case PyUnicode_4BYTE_KIND:
12274 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12275 break;
12276 default:
12277 assert(0);
12278 out = 0;
12279 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012280
12281 Py_DECREF(sep_obj);
12282 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 if (kind1 != kind)
12284 PyMem_Free(buf1);
12285 if (kind2 != kind)
12286 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012287
12288 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 onError:
12290 Py_DECREF(sep_obj);
12291 Py_DECREF(str_obj);
12292 if (kind1 != kind && buf1)
12293 PyMem_Free(buf1);
12294 if (kind2 != kind && buf2)
12295 PyMem_Free(buf2);
12296 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012297}
12298
12299
12300PyObject *
12301PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12302{
12303 PyObject* str_obj;
12304 PyObject* sep_obj;
12305 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 int kind1, kind2, kind;
12307 void *buf1 = NULL, *buf2 = NULL;
12308 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309
12310 str_obj = PyUnicode_FromObject(str_in);
12311 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012312 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313 sep_obj = PyUnicode_FromObject(sep_in);
12314 if (!sep_obj) {
12315 Py_DECREF(str_obj);
12316 return NULL;
12317 }
12318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319 kind1 = PyUnicode_KIND(str_in);
12320 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012321 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 buf1 = PyUnicode_DATA(str_in);
12323 if (kind1 != kind)
12324 buf1 = _PyUnicode_AsKind(str_in, kind);
12325 if (!buf1)
12326 goto onError;
12327 buf2 = PyUnicode_DATA(sep_obj);
12328 if (kind2 != kind)
12329 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12330 if (!buf2)
12331 goto onError;
12332 len1 = PyUnicode_GET_LENGTH(str_obj);
12333 len2 = PyUnicode_GET_LENGTH(sep_obj);
12334
Benjamin Petersonead6b532011-12-20 17:23:42 -060012335 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012337 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12338 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12339 else
12340 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 break;
12342 case PyUnicode_2BYTE_KIND:
12343 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12344 break;
12345 case PyUnicode_4BYTE_KIND:
12346 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12347 break;
12348 default:
12349 assert(0);
12350 out = 0;
12351 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012352
12353 Py_DECREF(sep_obj);
12354 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 if (kind1 != kind)
12356 PyMem_Free(buf1);
12357 if (kind2 != kind)
12358 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012359
12360 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 onError:
12362 Py_DECREF(sep_obj);
12363 Py_DECREF(str_obj);
12364 if (kind1 != kind && buf1)
12365 PyMem_Free(buf1);
12366 if (kind2 != kind && buf2)
12367 PyMem_Free(buf2);
12368 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012369}
12370
12371PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012374Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012375the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012376found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012377
12378static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012379unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380{
Victor Stinner9310abb2011-10-05 00:59:23 +020012381 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012382}
12383
12384PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012385 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012386\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012387Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012388the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012389separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012390
12391static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012392unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012393{
Victor Stinner9310abb2011-10-05 00:59:23 +020012394 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395}
12396
Alexander Belopolsky40018472011-02-26 01:02:56 +000012397PyObject *
12398PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012399{
12400 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012402 s = PyUnicode_FromObject(s);
12403 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012404 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 if (sep != NULL) {
12406 sep = PyUnicode_FromObject(sep);
12407 if (sep == NULL) {
12408 Py_DECREF(s);
12409 return NULL;
12410 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012411 }
12412
Victor Stinner9310abb2011-10-05 00:59:23 +020012413 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012414
12415 Py_DECREF(s);
12416 Py_XDECREF(sep);
12417 return result;
12418}
12419
12420PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012421 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012422\n\
12423Return a list of the words in S, using sep as the\n\
12424delimiter string, starting at the end of the string and\n\
12425working to the front. If maxsplit is given, at most maxsplit\n\
12426splits are done. If sep is not specified, any whitespace string\n\
12427is a separator.");
12428
12429static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012430unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012431{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012432 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012433 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012434 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012435
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012436 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12437 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012438 return NULL;
12439
12440 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012441 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012442 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012443 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012444 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012445 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012446}
12447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012448PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450\n\
12451Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012452Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012453is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012454
12455static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012456unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012457{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012458 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012459 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012461 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12462 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463 return NULL;
12464
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012465 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466}
12467
12468static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012469PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012471 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472}
12473
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012474PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476\n\
12477Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012478and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479
12480static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012481unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012483 if (PyUnicode_READY(self) == -1)
12484 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012485 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486}
12487
Georg Brandlceee0772007-11-27 23:48:05 +000012488PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012489 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012490\n\
12491Return a translation table usable for str.translate().\n\
12492If there is only one argument, it must be a dictionary mapping Unicode\n\
12493ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012494Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012495If there are two arguments, they must be strings of equal length, and\n\
12496in the resulting dictionary, each character in x will be mapped to the\n\
12497character at the same position in y. If there is a third argument, it\n\
12498must be a string, whose characters will be mapped to None in the result.");
12499
12500static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012501unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012502{
12503 PyObject *x, *y = NULL, *z = NULL;
12504 PyObject *new = NULL, *key, *value;
12505 Py_ssize_t i = 0;
12506 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012507
Georg Brandlceee0772007-11-27 23:48:05 +000012508 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12509 return NULL;
12510 new = PyDict_New();
12511 if (!new)
12512 return NULL;
12513 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 int x_kind, y_kind, z_kind;
12515 void *x_data, *y_data, *z_data;
12516
Georg Brandlceee0772007-11-27 23:48:05 +000012517 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012518 if (!PyUnicode_Check(x)) {
12519 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12520 "be a string if there is a second argument");
12521 goto err;
12522 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012524 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12525 "arguments must have equal length");
12526 goto err;
12527 }
12528 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 x_kind = PyUnicode_KIND(x);
12530 y_kind = PyUnicode_KIND(y);
12531 x_data = PyUnicode_DATA(x);
12532 y_data = PyUnicode_DATA(y);
12533 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12534 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012535 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012536 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012537 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012538 if (!value) {
12539 Py_DECREF(key);
12540 goto err;
12541 }
Georg Brandlceee0772007-11-27 23:48:05 +000012542 res = PyDict_SetItem(new, key, value);
12543 Py_DECREF(key);
12544 Py_DECREF(value);
12545 if (res < 0)
12546 goto err;
12547 }
12548 /* create entries for deleting chars in z */
12549 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 z_kind = PyUnicode_KIND(z);
12551 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012552 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012554 if (!key)
12555 goto err;
12556 res = PyDict_SetItem(new, key, Py_None);
12557 Py_DECREF(key);
12558 if (res < 0)
12559 goto err;
12560 }
12561 }
12562 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012563 int kind;
12564 void *data;
12565
Georg Brandlceee0772007-11-27 23:48:05 +000012566 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012567 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012568 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12569 "to maketrans it must be a dict");
12570 goto err;
12571 }
12572 /* copy entries into the new dict, converting string keys to int keys */
12573 while (PyDict_Next(x, &i, &key, &value)) {
12574 if (PyUnicode_Check(key)) {
12575 /* convert string keys to integer keys */
12576 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012577 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012578 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12579 "table must be of length 1");
12580 goto err;
12581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 kind = PyUnicode_KIND(key);
12583 data = PyUnicode_DATA(key);
12584 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012585 if (!newkey)
12586 goto err;
12587 res = PyDict_SetItem(new, newkey, value);
12588 Py_DECREF(newkey);
12589 if (res < 0)
12590 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012591 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012592 /* just keep integer keys */
12593 if (PyDict_SetItem(new, key, value) < 0)
12594 goto err;
12595 } else {
12596 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12597 "be strings or integers");
12598 goto err;
12599 }
12600 }
12601 }
12602 return new;
12603 err:
12604 Py_DECREF(new);
12605 return NULL;
12606}
12607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012608PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012609 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610\n\
12611Return a copy of the string S, where all characters have been mapped\n\
12612through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012613Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012614Unmapped characters are left untouched. Characters mapped to None\n\
12615are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616
12617static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012618unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012620 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621}
12622
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012623PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012624 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012625\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012626Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627
12628static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012629unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012631 if (PyUnicode_READY(self) == -1)
12632 return NULL;
12633 if (PyUnicode_IS_ASCII(self))
12634 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012635 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636}
12637
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012638PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012640\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012641Pad a numeric string S with zeros on the left, to fill a field\n\
12642of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643
12644static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012645unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012647 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012648 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012649 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 int kind;
12651 void *data;
12652 Py_UCS4 chr;
12653
Martin v. Löwis18e16552006-02-15 17:27:45 +000012654 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012655 return NULL;
12656
Benjamin Petersonbac79492012-01-14 13:34:47 -050012657 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659
Victor Stinnerc4b49542011-12-11 22:44:26 +010012660 if (PyUnicode_GET_LENGTH(self) >= width)
12661 return unicode_result_unchanged(self);
12662
12663 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664
12665 u = pad(self, fill, 0, '0');
12666
Walter Dörwald068325e2002-04-15 13:36:47 +000012667 if (u == NULL)
12668 return NULL;
12669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 kind = PyUnicode_KIND(u);
12671 data = PyUnicode_DATA(u);
12672 chr = PyUnicode_READ(kind, data, fill);
12673
12674 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 PyUnicode_WRITE(kind, data, 0, chr);
12677 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678 }
12679
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012680 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012681 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683
#if 0
/* Compiled out.  Wrapper that would return the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012692PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012695Return True if S starts with the specified prefix, False otherwise.\n\
12696With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012697With optional end, stop comparing S at that position.\n\
12698prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699
12700static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012701unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012702 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012703{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012704 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012705 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012706 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012707 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012708 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709
Jesus Ceaac451502011-04-20 17:09:23 +020012710 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012712 if (PyTuple_Check(subobj)) {
12713 Py_ssize_t i;
12714 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012715 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012716 if (substring == NULL)
12717 return NULL;
12718 result = tailmatch(self, substring, start, end, -1);
12719 Py_DECREF(substring);
12720 if (result) {
12721 Py_RETURN_TRUE;
12722 }
12723 }
12724 /* nothing matched */
12725 Py_RETURN_FALSE;
12726 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012727 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012728 if (substring == NULL) {
12729 if (PyErr_ExceptionMatches(PyExc_TypeError))
12730 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12731 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012733 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012734 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012735 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012736 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737}
12738
12739
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012740PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012743Return True if S ends with the specified suffix, False otherwise.\n\
12744With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012745With optional end, stop comparing S at that position.\n\
12746suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747
12748static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012749unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012752 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012753 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012754 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012755 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012756 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757
Jesus Ceaac451502011-04-20 17:09:23 +020012758 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012759 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012760 if (PyTuple_Check(subobj)) {
12761 Py_ssize_t i;
12762 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012763 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012764 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012765 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012767 result = tailmatch(self, substring, start, end, +1);
12768 Py_DECREF(substring);
12769 if (result) {
12770 Py_RETURN_TRUE;
12771 }
12772 }
12773 Py_RETURN_FALSE;
12774 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012775 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012776 if (substring == NULL) {
12777 if (PyErr_ExceptionMatches(PyExc_TypeError))
12778 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12779 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012780 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012784 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785}
12786
Victor Stinner202fdca2012-05-07 12:47:02 +020012787Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012788_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012789{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012790 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012791 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12792 writer->data = PyUnicode_DATA(writer->buffer);
12793 writer->kind = PyUnicode_KIND(writer->buffer);
12794}
12795
Victor Stinnerd3f08822012-05-29 12:57:52 +020012796void
12797_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length)
Victor Stinner202fdca2012-05-07 12:47:02 +020012798{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012799 memset(writer, 0, sizeof(*writer));
12800#ifdef Py_DEBUG
12801 writer->kind = 5; /* invalid kind */
12802#endif
12803 writer->min_length = Py_MAX(min_length, 100);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012804 writer->overallocate = (min_length > 0);
Victor Stinner202fdca2012-05-07 12:47:02 +020012805}
12806
Victor Stinnerd3f08822012-05-29 12:57:52 +020012807int
12808_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12809 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020012810{
12811 Py_ssize_t newlen;
12812 PyObject *newbuffer;
12813
Victor Stinnerd3f08822012-05-29 12:57:52 +020012814 assert(length > 0);
12815
Victor Stinner202fdca2012-05-07 12:47:02 +020012816 if (length > PY_SSIZE_T_MAX - writer->pos) {
12817 PyErr_NoMemory();
12818 return -1;
12819 }
12820 newlen = writer->pos + length;
12821
Victor Stinnerd3f08822012-05-29 12:57:52 +020012822 if (writer->buffer == NULL) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012823 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012824 /* overallocate 25% to limit the number of resize */
12825 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12826 newlen += newlen / 4;
12827 if (newlen < writer->min_length)
12828 newlen = writer->min_length;
12829 }
12830 writer->buffer = PyUnicode_New(newlen, maxchar);
12831 if (writer->buffer == NULL)
12832 return -1;
12833 _PyUnicodeWriter_Update(writer);
12834 return 0;
12835 }
Victor Stinner202fdca2012-05-07 12:47:02 +020012836
Victor Stinnerd3f08822012-05-29 12:57:52 +020012837 if (newlen > writer->size) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012838 if (writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012839 /* overallocate 25% to limit the number of resize */
12840 if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))
12841 newlen += newlen / 4;
12842 if (newlen < writer->min_length)
12843 newlen = writer->min_length;
12844 }
12845
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012846 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020012847 /* resize + widen */
12848 newbuffer = PyUnicode_New(newlen, maxchar);
12849 if (newbuffer == NULL)
12850 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012851 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12852 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020012853 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012854 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020012855 }
12856 else {
12857 newbuffer = resize_compact(writer->buffer, newlen);
12858 if (newbuffer == NULL)
12859 return -1;
12860 }
12861 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012862 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012863 }
12864 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012865 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012866 newbuffer = PyUnicode_New(writer->size, maxchar);
12867 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020012868 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012869 _PyUnicode_FastCopyCharacters(newbuffer, 0,
12870 writer->buffer, 0, writer->pos);
12871 Py_DECREF(writer->buffer);
12872 writer->buffer = newbuffer;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012873 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012874 }
12875 return 0;
12876}
12877
Victor Stinnerd3f08822012-05-29 12:57:52 +020012878int
12879_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
12880{
12881 Py_UCS4 maxchar;
12882 Py_ssize_t len;
12883
12884 if (PyUnicode_READY(str) == -1)
12885 return -1;
12886 len = PyUnicode_GET_LENGTH(str);
12887 if (len == 0)
12888 return 0;
12889 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
12890 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012891 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012892 Py_INCREF(str);
12893 writer->buffer = str;
12894 _PyUnicodeWriter_Update(writer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012895 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020012896 writer->size = 0;
12897 writer->pos += len;
12898 return 0;
12899 }
12900 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
12901 return -1;
12902 }
12903 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
12904 str, 0, len);
12905 writer->pos += len;
12906 return 0;
12907}
12908
12909PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012910_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012911{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012912 if (writer->pos == 0) {
12913 Py_XDECREF(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020012914 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020012915 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020012916 if (writer->readonly) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020012917 assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
12918 return writer->buffer;
12919 }
12920 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
12921 PyObject *newbuffer;
12922 newbuffer = resize_compact(writer->buffer, writer->pos);
12923 if (newbuffer == NULL) {
12924 Py_DECREF(writer->buffer);
12925 return NULL;
12926 }
12927 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020012928 }
Victor Stinnerf59c28c2012-05-09 03:24:14 +020012929 assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
Victor Stinner2cb16aa2013-03-06 19:28:37 +010012930 return unicode_result_ready(writer->buffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020012931}
12932
Victor Stinnerd3f08822012-05-29 12:57:52 +020012933void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020012934_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020012935{
12936 Py_CLEAR(writer->buffer);
12937}
12938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012940
12941PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012942 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012943\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012944Return a formatted version of S, using substitutions from args and kwargs.\n\
12945The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012946
Eric Smith27bbca62010-11-04 17:06:58 +000012947PyDoc_STRVAR(format_map__doc__,
12948 "S.format_map(mapping) -> str\n\
12949\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012950Return a formatted version of S, using substitutions from mapping.\n\
12951The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012952
Eric Smith4a7d76d2008-05-30 18:10:19 +000012953static PyObject *
12954unicode__format__(PyObject* self, PyObject* args)
12955{
Victor Stinnerd3f08822012-05-29 12:57:52 +020012956 PyObject *format_spec;
12957 _PyUnicodeWriter writer;
12958 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012959
12960 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12961 return NULL;
12962
Victor Stinnerd3f08822012-05-29 12:57:52 +020012963 if (PyUnicode_READY(self) == -1)
12964 return NULL;
12965 _PyUnicodeWriter_Init(&writer, 0);
12966 ret = _PyUnicode_FormatAdvancedWriter(&writer,
12967 self, format_spec, 0,
12968 PyUnicode_GET_LENGTH(format_spec));
12969 if (ret == -1) {
12970 _PyUnicodeWriter_Dealloc(&writer);
12971 return NULL;
12972 }
12973 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000012974}
12975
Eric Smith8c663262007-08-25 02:26:07 +000012976PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012977 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012978\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012979Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012980
12981static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012982unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012983{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 Py_ssize_t size;
12985
12986 /* If it's a compact object, account for base structure +
12987 character data. */
12988 if (PyUnicode_IS_COMPACT_ASCII(v))
12989 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12990 else if (PyUnicode_IS_COMPACT(v))
12991 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012992 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993 else {
12994 /* If it is a two-block object, account for base object, and
12995 for character block if present. */
12996 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012997 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012999 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013000 }
13001 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013002 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013003 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013005 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013006 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007
13008 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013009}
13010
13011PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013013
13014static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013015unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013016{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013017 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 if (!copy)
13019 return NULL;
13020 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013021}
13022
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013024 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013025 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013026 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13027 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013028 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13029 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013030 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013031 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13032 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13033 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13034 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
13035 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013036 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013037 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13038 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13039 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013040 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013041 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13042 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13043 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013044 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013045 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010013046 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013047 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013048 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13049 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13050 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13051 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13052 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13053 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13054 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13055 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13056 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13057 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13058 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13059 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13060 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13061 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013062 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013063 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013064 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013065 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013066 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013067 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000013068 {"maketrans", (PyCFunction) unicode_maketrans,
13069 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013070 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013071#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013072 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013073 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013074#endif
13075
Benjamin Peterson14339b62009-01-31 16:36:08 +000013076 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077 {NULL, NULL}
13078};
13079
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013080static PyObject *
13081unicode_mod(PyObject *v, PyObject *w)
13082{
Brian Curtindfc80e32011-08-10 20:28:54 -050013083 if (!PyUnicode_Check(v))
13084 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013086}
13087
13088static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013089 0, /*nb_add*/
13090 0, /*nb_subtract*/
13091 0, /*nb_multiply*/
13092 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013093};
13094
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013096 (lenfunc) unicode_length, /* sq_length */
13097 PyUnicode_Concat, /* sq_concat */
13098 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13099 (ssizeargfunc) unicode_getitem, /* sq_item */
13100 0, /* sq_slice */
13101 0, /* sq_ass_item */
13102 0, /* sq_ass_slice */
13103 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104};
13105
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013106static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013107unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013108{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013109 if (PyUnicode_READY(self) == -1)
13110 return NULL;
13111
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013112 if (PyIndex_Check(item)) {
13113 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013114 if (i == -1 && PyErr_Occurred())
13115 return NULL;
13116 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013117 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013118 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013119 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013120 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013121 PyObject *result;
13122 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013123 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013124 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013126 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013127 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013128 return NULL;
13129 }
13130
13131 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013132 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013134 slicelength == PyUnicode_GET_LENGTH(self)) {
13135 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013136 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013137 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013138 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013139 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013140 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013141 src_kind = PyUnicode_KIND(self);
13142 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013143 if (!PyUnicode_IS_ASCII(self)) {
13144 kind_limit = kind_maxchar_limit(src_kind);
13145 max_char = 0;
13146 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13147 ch = PyUnicode_READ(src_kind, src_data, cur);
13148 if (ch > max_char) {
13149 max_char = ch;
13150 if (max_char >= kind_limit)
13151 break;
13152 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013153 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013154 }
Victor Stinner55c99112011-10-13 01:17:06 +020013155 else
13156 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013157 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013158 if (result == NULL)
13159 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013160 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013161 dest_data = PyUnicode_DATA(result);
13162
13163 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013164 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13165 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013166 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013167 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013168 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013169 } else {
13170 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13171 return NULL;
13172 }
13173}
13174
13175static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176 (lenfunc)unicode_length, /* mp_length */
13177 (binaryfunc)unicode_subscript, /* mp_subscript */
13178 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013179};
13180
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181
Guido van Rossumd57fd912000-03-10 22:53:23 +000013182/* Helpers for PyUnicode_Format() */
13183
13184static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013185getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013186{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013187 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013188 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 (*p_argidx)++;
13190 if (arglen < 0)
13191 return args;
13192 else
13193 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013194 }
13195 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197 return NULL;
13198}
13199
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013200/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201
Victor Stinnerd3f08822012-05-29 12:57:52 +020013202static int
13203formatfloat(PyObject *v, int flags, int prec, int type,
13204 PyObject **p_output, _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013206 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013208 Py_ssize_t len;
Tim Petersced69f82003-09-16 20:30:58 +000013209
Guido van Rossumd57fd912000-03-10 22:53:23 +000013210 x = PyFloat_AsDouble(v);
13211 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013212 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013213
Guido van Rossumd57fd912000-03-10 22:53:23 +000013214 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013216
Eric Smith0923d1d2009-04-16 20:16:10 +000013217 p = PyOS_double_to_string(x, type, prec,
13218 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013219 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013220 return -1;
13221 len = strlen(p);
13222 if (writer) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013223 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13224 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013225 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013226 }
Victor Stinner184252a2012-06-16 02:57:41 +020013227 unicode_write_cstr(writer->buffer, writer->pos, p, len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013228 writer->pos += len;
13229 }
13230 else
13231 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013232 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013233 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234}
13235
Victor Stinnerd0880d52012-04-27 23:40:13 +020013236/* formatlong() emulates the format codes d, u, o, x and X, and
13237 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13238 * Python's regular ints.
13239 * Return value: a new PyUnicodeObject*, or NULL if error.
13240 * The output string is of the form
13241 * "-"? ("0x" | "0X")? digit+
13242 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13243 * set in flags. The case of hex digits will be correct,
13244 * There will be at least prec digits, zero-filled on the left if
13245 * necessary to get that many.
13246 * val object to be converted
13247 * flags bitmask of format flags; only F_ALT is looked at
13248 * prec minimum number of digits; 0-fill on left if needed
13249 * type a character in [duoxX]; u acts the same as d
13250 *
13251 * CAUTION: o, x and X conversions on regular ints can never
13252 * produce a '-' sign, but can for Python's unbounded ints.
13253 */
Tim Peters38fd5b62000-09-21 05:43:11 +000013254static PyObject*
13255formatlong(PyObject *val, int flags, int prec, int type)
13256{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013257 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013258 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013259 Py_ssize_t i;
13260 int sign; /* 1 if '-', else 0 */
13261 int len; /* number of characters */
13262 Py_ssize_t llen;
13263 int numdigits; /* len == numnondigits + numdigits */
13264 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013265
Victor Stinnerd0880d52012-04-27 23:40:13 +020013266 /* Avoid exceeding SSIZE_T_MAX */
13267 if (prec > INT_MAX-3) {
13268 PyErr_SetString(PyExc_OverflowError,
13269 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013270 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013271 }
13272
13273 assert(PyLong_Check(val));
13274
13275 switch (type) {
13276 case 'd':
13277 case 'u':
13278 /* Special-case boolean: we want 0/1 */
Victor Stinnerb11d91d2012-04-28 00:25:34 +020013279 if (PyBool_Check(val))
13280 result = PyNumber_ToBase(val, 10);
13281 else
13282 result = Py_TYPE(val)->tp_str(val);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013283 break;
13284 case 'o':
13285 numnondigits = 2;
13286 result = PyNumber_ToBase(val, 8);
13287 break;
13288 case 'x':
13289 case 'X':
13290 numnondigits = 2;
13291 result = PyNumber_ToBase(val, 16);
13292 break;
13293 default:
13294 assert(!"'type' not in [duoxX]");
13295 }
13296 if (!result)
13297 return NULL;
13298
13299 assert(unicode_modifiable(result));
13300 assert(PyUnicode_IS_READY(result));
13301 assert(PyUnicode_IS_ASCII(result));
13302
13303 /* To modify the string in-place, there can only be one reference. */
13304 if (Py_REFCNT(result) != 1) {
13305 PyErr_BadInternalCall();
13306 return NULL;
13307 }
13308 buf = PyUnicode_DATA(result);
13309 llen = PyUnicode_GET_LENGTH(result);
13310 if (llen > INT_MAX) {
13311 PyErr_SetString(PyExc_ValueError,
13312 "string too large in _PyBytes_FormatLong");
13313 return NULL;
13314 }
13315 len = (int)llen;
13316 sign = buf[0] == '-';
13317 numnondigits += sign;
13318 numdigits = len - numnondigits;
13319 assert(numdigits > 0);
13320
13321 /* Get rid of base marker unless F_ALT */
13322 if (((flags & F_ALT) == 0 &&
13323 (type == 'o' || type == 'x' || type == 'X'))) {
13324 assert(buf[sign] == '0');
13325 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13326 buf[sign+1] == 'o');
13327 numnondigits -= 2;
13328 buf += 2;
13329 len -= 2;
13330 if (sign)
13331 buf[0] = '-';
13332 assert(len == numnondigits + numdigits);
13333 assert(numdigits > 0);
13334 }
13335
13336 /* Fill with leading zeroes to meet minimum width. */
13337 if (prec > numdigits) {
13338 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13339 numnondigits + prec);
13340 char *b1;
13341 if (!r1) {
13342 Py_DECREF(result);
13343 return NULL;
13344 }
13345 b1 = PyBytes_AS_STRING(r1);
13346 for (i = 0; i < numnondigits; ++i)
13347 *b1++ = *buf++;
13348 for (i = 0; i < prec - numdigits; i++)
13349 *b1++ = '0';
13350 for (i = 0; i < numdigits; i++)
13351 *b1++ = *buf++;
13352 *b1 = '\0';
13353 Py_DECREF(result);
13354 result = r1;
13355 buf = PyBytes_AS_STRING(result);
13356 len = numnondigits + prec;
13357 }
13358
13359 /* Fix up case for hex conversions. */
13360 if (type == 'X') {
13361 /* Need to convert all lower case letters to upper case.
13362 and need to convert 0x to 0X (and -0x to -0X). */
13363 for (i = 0; i < len; i++)
13364 if (buf[i] >= 'a' && buf[i] <= 'x')
13365 buf[i] -= 'a'-'A';
13366 }
13367 if (!PyUnicode_Check(result) || len != PyUnicode_GET_LENGTH(result)) {
13368 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013369 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013370 Py_DECREF(result);
13371 result = unicode;
13372 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013373 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013374}
13375
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013376static Py_UCS4
13377formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013379 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013380 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013381 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013382 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 goto onError;
13385 }
13386 else {
13387 /* Integer input truncated to a character */
13388 long x;
13389 x = PyLong_AsLong(v);
13390 if (x == -1 && PyErr_Occurred())
13391 goto onError;
13392
Victor Stinner8faf8212011-12-08 22:14:11 +010013393 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013394 PyErr_SetString(PyExc_OverflowError,
13395 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013397 }
13398
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013399 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013400 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013401
Benjamin Peterson29060642009-01-31 22:14:21 +000013402 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013403 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013405 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406}
13407
Alexander Belopolsky40018472011-02-26 01:02:56 +000013408PyObject *
13409PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013410{
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013411 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013412 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013414 PyObject *temp = NULL;
13415 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013416 PyObject *uformat;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013417 void *fmt;
13418 enum PyUnicode_Kind kind, fmtkind;
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013419 _PyUnicodeWriter writer;
Victor Stinneree4544c2012-05-09 22:24:08 +020013420 Py_ssize_t sublen;
13421 Py_UCS4 maxchar;
Tim Petersced69f82003-09-16 20:30:58 +000013422
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 PyErr_BadInternalCall();
13425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013427 uformat = PyUnicode_FromObject(format);
Benjamin Peterson22a29702012-01-02 09:00:30 -060013428 if (uformat == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 return NULL;
Victor Stinner19294072012-10-05 00:09:33 +020013430 if (PyUnicode_READY(uformat) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013431 Py_DECREF(uformat);
Victor Stinner19294072012-10-05 00:09:33 +020013432 return NULL;
13433 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 fmt = PyUnicode_DATA(uformat);
13436 fmtkind = PyUnicode_KIND(uformat);
13437 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13438 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439
Victor Stinnerd3f08822012-05-29 12:57:52 +020013440 _PyUnicodeWriter_Init(&writer, fmtcnt + 100);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013441
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 arglen = PyTuple_Size(args);
13444 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445 }
13446 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 arglen = -1;
13448 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449 }
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040013450 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013452
13453 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013454 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013455 Py_ssize_t nonfmtpos;
13456 nonfmtpos = fmtpos++;
13457 while (fmtcnt >= 0 &&
13458 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13459 fmtpos++;
13460 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013461 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013462 if (fmtcnt < 0)
13463 fmtpos--;
Victor Stinneree4544c2012-05-09 22:24:08 +020013464 sublen = fmtpos - nonfmtpos;
13465 maxchar = _PyUnicode_FindMaxChar(uformat,
13466 nonfmtpos, nonfmtpos + sublen);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013467 if (_PyUnicodeWriter_Prepare(&writer, sublen, maxchar) == -1)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013468 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013469
Victor Stinnerd3f08822012-05-29 12:57:52 +020013470 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13471 uformat, nonfmtpos, sublen);
Victor Stinneree4544c2012-05-09 22:24:08 +020013472 writer.pos += sublen;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013473 }
13474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 /* Got a format specifier */
13476 int flags = 0;
13477 Py_ssize_t width = -1;
13478 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013479 Py_UCS4 c = '\0';
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013480 Py_UCS4 fill;
13481 int sign;
13482 Py_UCS4 signchar;
Benjamin Peterson29060642009-01-31 22:14:21 +000013483 int isnumok;
13484 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013485 void *pbuf = NULL;
13486 Py_ssize_t pindex, len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013487 Py_UCS4 bufmaxchar;
13488 Py_ssize_t buflen;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 fmtpos++;
Victor Stinner438106b2012-05-02 00:41:57 +020013491 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13492 if (c == '(') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013493 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013494 Py_ssize_t keylen;
13495 PyObject *key;
13496 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013497
Benjamin Peterson29060642009-01-31 22:14:21 +000013498 if (dict == NULL) {
13499 PyErr_SetString(PyExc_TypeError,
13500 "format requires a mapping");
13501 goto onError;
13502 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013503 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013505 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 /* Skip over balanced parentheses */
13507 while (pcount > 0 && --fmtcnt >= 0) {
Victor Stinnerbff7c962012-05-03 01:44:59 +020013508 c = PyUnicode_READ(fmtkind, fmt, fmtpos);
13509 if (c == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 --pcount;
Victor Stinnerbff7c962012-05-03 01:44:59 +020013511 else if (c == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013512 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013513 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013515 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013516 if (fmtcnt < 0 || pcount > 0) {
13517 PyErr_SetString(PyExc_ValueError,
13518 "incomplete format key");
13519 goto onError;
13520 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013521 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013522 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 if (key == NULL)
13524 goto onError;
13525 if (args_owned) {
13526 Py_DECREF(args);
13527 args_owned = 0;
13528 }
13529 args = PyObject_GetItem(dict, key);
13530 Py_DECREF(key);
13531 if (args == NULL) {
13532 goto onError;
13533 }
13534 args_owned = 1;
13535 arglen = -1;
13536 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013537 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 while (--fmtcnt >= 0) {
Victor Stinner438106b2012-05-02 00:41:57 +020013539 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
13540 switch (c) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 case '-': flags |= F_LJUST; continue;
13542 case '+': flags |= F_SIGN; continue;
13543 case ' ': flags |= F_BLANK; continue;
13544 case '#': flags |= F_ALT; continue;
13545 case '0': flags |= F_ZERO; continue;
13546 }
13547 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013548 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013549 if (c == '*') {
13550 v = getnextarg(args, arglen, &argidx);
13551 if (v == NULL)
13552 goto onError;
13553 if (!PyLong_Check(v)) {
13554 PyErr_SetString(PyExc_TypeError,
13555 "* wants int");
13556 goto onError;
13557 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013558 width = PyLong_AsSsize_t(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013559 if (width == -1 && PyErr_Occurred())
13560 goto onError;
13561 if (width < 0) {
13562 flags |= F_LJUST;
13563 width = -width;
13564 }
13565 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013566 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 }
13568 else if (c >= '0' && c <= '9') {
13569 width = c - '0';
13570 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013571 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013572 if (c < '0' || c > '9')
13573 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013574 /* Since c is unsigned, the RHS would end up as unsigned,
13575 mixing signed and unsigned comparison. Since c is between
13576 '0' and '9', casting to int is safe. */
13577 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013578 PyErr_SetString(PyExc_ValueError,
13579 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013580 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 }
13582 width = width*10 + (c - '0');
13583 }
13584 }
13585 if (c == '.') {
13586 prec = 0;
13587 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013588 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013589 if (c == '*') {
13590 v = getnextarg(args, arglen, &argidx);
13591 if (v == NULL)
13592 goto onError;
13593 if (!PyLong_Check(v)) {
13594 PyErr_SetString(PyExc_TypeError,
13595 "* wants int");
13596 goto onError;
13597 }
Serhiy Storchaka441d30f2013-01-19 12:26:26 +020013598 prec = _PyLong_AsInt(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013599 if (prec == -1 && PyErr_Occurred())
13600 goto onError;
13601 if (prec < 0)
13602 prec = 0;
13603 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013604 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 }
13606 else if (c >= '0' && c <= '9') {
13607 prec = c - '0';
13608 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013609 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 if (c < '0' || c > '9')
13611 break;
Martin v. Löwisb05c0732012-05-15 13:45:49 +020013612 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 PyErr_SetString(PyExc_ValueError,
13614 "prec too big");
13615 goto onError;
13616 }
13617 prec = prec*10 + (c - '0');
13618 }
13619 }
13620 } /* prec */
13621 if (fmtcnt >= 0) {
13622 if (c == 'h' || c == 'l' || c == 'L') {
13623 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 }
13626 }
13627 if (fmtcnt < 0) {
13628 PyErr_SetString(PyExc_ValueError,
13629 "incomplete format");
13630 goto onError;
13631 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013632 if (fmtcnt == 0)
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013633 writer.overallocate = 0;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013634
13635 if (c == '%') {
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013636 if (_PyUnicodeWriter_Prepare(&writer, 1, '%') == -1)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013637 goto onError;
Victor Stinneree4544c2012-05-09 22:24:08 +020013638 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '%');
13639 writer.pos += 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013640 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013641 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013642
Victor Stinneraff3cc62012-04-30 05:19:21 +020013643 v = getnextarg(args, arglen, &argidx);
13644 if (v == NULL)
13645 goto onError;
13646
Benjamin Peterson29060642009-01-31 22:14:21 +000013647 sign = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013648 signchar = '\0';
Benjamin Peterson29060642009-01-31 22:14:21 +000013649 fill = ' ';
13650 switch (c) {
13651
Benjamin Peterson29060642009-01-31 22:14:21 +000013652 case 's':
13653 case 'r':
13654 case 'a':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013655 if (PyLong_CheckExact(v) && width == -1 && prec == -1) {
13656 /* Fast path */
13657 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13658 goto onError;
13659 goto nextarg;
13660 }
13661
Victor Stinner808fc0a2010-03-22 12:50:40 +000013662 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013663 temp = v;
13664 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013665 }
13666 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013667 if (c == 's')
13668 temp = PyObject_Str(v);
13669 else if (c == 'r')
13670 temp = PyObject_Repr(v);
13671 else
13672 temp = PyObject_ASCII(v);
Benjamin Peterson29060642009-01-31 22:14:21 +000013673 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013674 break;
13675
13676 case 'i':
13677 case 'd':
13678 case 'u':
13679 case 'o':
13680 case 'x':
13681 case 'X':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013682 if (PyLong_CheckExact(v)
13683 && width == -1 && prec == -1
13684 && !(flags & (F_SIGN | F_BLANK)))
13685 {
13686 /* Fast path */
13687 switch(c)
13688 {
13689 case 'd':
13690 case 'i':
13691 case 'u':
13692 if (_PyLong_FormatWriter(&writer, v, 10, flags & F_ALT) == -1)
13693 goto onError;
13694 goto nextarg;
13695 case 'x':
13696 if (_PyLong_FormatWriter(&writer, v, 16, flags & F_ALT) == -1)
13697 goto onError;
13698 goto nextarg;
13699 case 'o':
13700 if (_PyLong_FormatWriter(&writer, v, 8, flags & F_ALT) == -1)
13701 goto onError;
13702 goto nextarg;
13703 default:
13704 break;
13705 }
13706 }
13707
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 isnumok = 0;
13709 if (PyNumber_Check(v)) {
13710 PyObject *iobj=NULL;
13711
13712 if (PyLong_Check(v)) {
13713 iobj = v;
13714 Py_INCREF(iobj);
13715 }
13716 else {
13717 iobj = PyNumber_Long(v);
13718 }
13719 if (iobj!=NULL) {
13720 if (PyLong_Check(iobj)) {
13721 isnumok = 1;
Victor Stinneraff3cc62012-04-30 05:19:21 +020013722 sign = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013723 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013724 Py_DECREF(iobj);
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 }
13726 else {
13727 Py_DECREF(iobj);
13728 }
13729 }
13730 }
13731 if (!isnumok) {
13732 PyErr_Format(PyExc_TypeError,
13733 "%%%c format: a number is required, "
13734 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13735 goto onError;
13736 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013737 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013738 fill = '0';
13739 break;
13740
13741 case 'e':
13742 case 'E':
13743 case 'f':
13744 case 'F':
13745 case 'g':
13746 case 'G':
Victor Stinnerd3f08822012-05-29 12:57:52 +020013747 if (width == -1 && prec == -1
13748 && !(flags & (F_SIGN | F_BLANK)))
13749 {
13750 /* Fast path */
13751 if (formatfloat(v, flags, prec, c, NULL, &writer) == -1)
13752 goto onError;
13753 goto nextarg;
13754 }
13755
Benjamin Peterson29060642009-01-31 22:14:21 +000013756 sign = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013757 if (flags & F_ZERO)
Benjamin Peterson29060642009-01-31 22:14:21 +000013758 fill = '0';
Victor Stinnerd3f08822012-05-29 12:57:52 +020013759 if (formatfloat(v, flags, prec, c, &temp, NULL) == -1)
13760 temp = NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000013761 break;
13762
13763 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013764 {
13765 Py_UCS4 ch = formatchar(v);
13766 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013767 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013768 if (width == -1 && prec == -1) {
13769 /* Fast path */
13770 if (_PyUnicodeWriter_Prepare(&writer, 1, ch) == -1)
13771 goto onError;
13772 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, ch);
13773 writer.pos += 1;
13774 goto nextarg;
13775 }
Victor Stinnerb5c3ea32012-05-02 00:29:36 +020013776 temp = PyUnicode_FromOrdinal(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013778 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013779
13780 default:
13781 PyErr_Format(PyExc_ValueError,
13782 "unsupported format character '%c' (0x%x) "
13783 "at index %zd",
13784 (31<=c && c<=126) ? (char)c : '?',
13785 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013786 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013787 goto onError;
13788 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020013789 if (temp == NULL)
13790 goto onError;
13791 assert (PyUnicode_Check(temp));
Victor Stinnerd3f08822012-05-29 12:57:52 +020013792
13793 if (width == -1 && prec == -1
13794 && !(flags & (F_SIGN | F_BLANK)))
13795 {
13796 /* Fast path */
13797 if (_PyUnicodeWriter_WriteStr(&writer, temp) == -1)
13798 goto onError;
13799 goto nextarg;
13800 }
13801
Victor Stinneraff3cc62012-04-30 05:19:21 +020013802 if (PyUnicode_READY(temp) == -1) {
13803 Py_CLEAR(temp);
13804 goto onError;
13805 }
13806 kind = PyUnicode_KIND(temp);
13807 pbuf = PyUnicode_DATA(temp);
13808 len = PyUnicode_GET_LENGTH(temp);
13809
13810 if (c == 's' || c == 'r' || c == 'a') {
13811 if (prec >= 0 && len > prec)
13812 len = prec;
13813 }
13814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013815 /* pbuf is initialized here. */
13816 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013817 if (sign) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013818 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
13819 if (ch == '-' || ch == '+') {
13820 signchar = ch;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013821 len--;
13822 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013823 }
13824 else if (flags & F_SIGN)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013825 signchar = '+';
Benjamin Peterson29060642009-01-31 22:14:21 +000013826 else if (flags & F_BLANK)
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013827 signchar = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +000013828 else
13829 sign = 0;
13830 }
13831 if (width < len)
13832 width = len;
Victor Stinneree4544c2012-05-09 22:24:08 +020013833
13834 /* Compute the length and maximum character of the
13835 written characters */
13836 bufmaxchar = 127;
13837 if (!(flags & F_LJUST)) {
13838 if (sign) {
13839 if ((width-1) > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013841 }
13842 else {
13843 if (width > len)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844 bufmaxchar = MAX_MAXCHAR(bufmaxchar, fill);
Victor Stinneree4544c2012-05-09 22:24:08 +020013845 }
13846 }
13847 maxchar = _PyUnicode_FindMaxChar(temp, 0, pindex+len);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013848 bufmaxchar = MAX_MAXCHAR(bufmaxchar, maxchar);
Victor Stinneree4544c2012-05-09 22:24:08 +020013849
13850 buflen = width;
13851 if (sign && len == width)
13852 buflen++;
13853
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013854 if (_PyUnicodeWriter_Prepare(&writer, buflen, bufmaxchar) == -1)
Victor Stinneree4544c2012-05-09 22:24:08 +020013855 goto onError;
13856
13857 /* Write characters */
Benjamin Peterson29060642009-01-31 22:14:21 +000013858 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013859 if (fill != ' ') {
Victor Stinneree4544c2012-05-09 22:24:08 +020013860 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13861 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013862 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013863 if (width > len)
13864 width--;
13865 }
13866 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013867 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013868 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 if (fill != ' ') {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013870 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13871 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13872 writer.pos += 2;
13873 pindex += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +000013874 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013875 width -= 2;
13876 if (width < 0)
13877 width = 0;
13878 len -= 2;
13879 }
13880 if (width > len && !(flags & F_LJUST)) {
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013881 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013882 FILL(writer.kind, writer.data, fill, writer.pos, sublen);
13883 writer.pos += sublen;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013884 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 }
13886 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013887 if (sign) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013888 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, signchar);
13889 writer.pos += 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013890 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013891 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13893 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013894 PyUnicode_WRITE(writer.kind, writer.data, writer.pos, '0');
13895 PyUnicode_WRITE(writer.kind, writer.data, writer.pos+1, c);
13896 writer.pos += 2;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013897 pindex += 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013898 }
13899 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013900
Victor Stinnerc9d369f2012-06-16 02:22:37 +020013901 if (len) {
13902 _PyUnicode_FastCopyCharacters(writer.buffer, writer.pos,
13903 temp, pindex, len);
13904 writer.pos += len;
13905 }
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013906 if (width > len) {
Victor Stinneree4544c2012-05-09 22:24:08 +020013907 sublen = width - len;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020013908 FILL(writer.kind, writer.data, ' ', writer.pos, sublen);
13909 writer.pos += sublen;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013910 }
Victor Stinneree4544c2012-05-09 22:24:08 +020013911
Victor Stinnerd3f08822012-05-29 12:57:52 +020013912nextarg:
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 if (dict && (argidx < arglen) && c != '%') {
13914 PyErr_SetString(PyExc_TypeError,
13915 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013916 goto onError;
13917 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013918 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920 } /* until end */
13921 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 PyErr_SetString(PyExc_TypeError,
13923 "not all arguments converted during string formatting");
13924 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013925 }
13926
13927 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013928 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013929 }
13930 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013931 Py_XDECREF(temp);
13932 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013933 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013934
Benjamin Peterson29060642009-01-31 22:14:21 +000013935 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013937 Py_XDECREF(temp);
13938 Py_XDECREF(second);
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013939 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013941 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942 }
13943 return NULL;
13944}
13945
Jeremy Hylton938ace62002-07-17 16:30:39 +000013946static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013947unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13948
Tim Peters6d6c1a32001-08-02 04:15:00 +000013949static PyObject *
13950unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13951{
Benjamin Peterson29060642009-01-31 22:14:21 +000013952 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013953 static char *kwlist[] = {"object", "encoding", "errors", 0};
13954 char *encoding = NULL;
13955 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013956
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 if (type != &PyUnicode_Type)
13958 return unicode_subtype_new(type, args, kwds);
13959 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013960 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013961 return NULL;
13962 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020013963 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000013964 if (encoding == NULL && errors == NULL)
13965 return PyObject_Str(x);
13966 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013968}
13969
Guido van Rossume023fe02001-08-30 03:12:59 +000013970static PyObject *
13971unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13972{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013973 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013974 Py_ssize_t length, char_size;
13975 int share_wstr, share_utf8;
13976 unsigned int kind;
13977 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013978
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013980
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013981 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013982 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013983 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013984 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050013985 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060013986 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013987 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060013988 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013989
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013990 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013991 if (self == NULL) {
13992 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 return NULL;
13994 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013995 kind = PyUnicode_KIND(unicode);
13996 length = PyUnicode_GET_LENGTH(unicode);
13997
13998 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013999#ifdef Py_DEBUG
14000 _PyUnicode_HASH(self) = -1;
14001#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014002 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014003#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014004 _PyUnicode_STATE(self).interned = 0;
14005 _PyUnicode_STATE(self).kind = kind;
14006 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014007 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014008 _PyUnicode_STATE(self).ready = 1;
14009 _PyUnicode_WSTR(self) = NULL;
14010 _PyUnicode_UTF8_LENGTH(self) = 0;
14011 _PyUnicode_UTF8(self) = NULL;
14012 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014013 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014014
14015 share_utf8 = 0;
14016 share_wstr = 0;
14017 if (kind == PyUnicode_1BYTE_KIND) {
14018 char_size = 1;
14019 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14020 share_utf8 = 1;
14021 }
14022 else if (kind == PyUnicode_2BYTE_KIND) {
14023 char_size = 2;
14024 if (sizeof(wchar_t) == 2)
14025 share_wstr = 1;
14026 }
14027 else {
14028 assert(kind == PyUnicode_4BYTE_KIND);
14029 char_size = 4;
14030 if (sizeof(wchar_t) == 4)
14031 share_wstr = 1;
14032 }
14033
14034 /* Ensure we won't overflow the length. */
14035 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14036 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014037 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014039 data = PyObject_MALLOC((length + 1) * char_size);
14040 if (data == NULL) {
14041 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014042 goto onError;
14043 }
14044
Victor Stinnerc3c74152011-10-02 20:39:55 +020014045 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014046 if (share_utf8) {
14047 _PyUnicode_UTF8_LENGTH(self) = length;
14048 _PyUnicode_UTF8(self) = data;
14049 }
14050 if (share_wstr) {
14051 _PyUnicode_WSTR_LENGTH(self) = length;
14052 _PyUnicode_WSTR(self) = (wchar_t *)data;
14053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014054
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014055 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014056 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014057 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014058#ifdef Py_DEBUG
14059 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14060#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014061 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014062 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014063
14064onError:
14065 Py_DECREF(unicode);
14066 Py_DECREF(self);
14067 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014068}
14069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014070PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014071"str(object='') -> str\n\
14072str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014073\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014074Create a new string object from the given object. If encoding or\n\
14075errors is specified, then the object must expose a data buffer\n\
14076that will be decoded using the given encoding and error handler.\n\
14077Otherwise, returns the result of object.__str__() (if defined)\n\
14078or repr(object).\n\
14079encoding defaults to sys.getdefaultencoding().\n\
14080errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014081
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014082static PyObject *unicode_iter(PyObject *seq);
14083
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014085 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 "str", /* tp_name */
14087 sizeof(PyUnicodeObject), /* tp_size */
14088 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014089 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014090 (destructor)unicode_dealloc, /* tp_dealloc */
14091 0, /* tp_print */
14092 0, /* tp_getattr */
14093 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014094 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014095 unicode_repr, /* tp_repr */
14096 &unicode_as_number, /* tp_as_number */
14097 &unicode_as_sequence, /* tp_as_sequence */
14098 &unicode_as_mapping, /* tp_as_mapping */
14099 (hashfunc) unicode_hash, /* tp_hash*/
14100 0, /* tp_call*/
14101 (reprfunc) unicode_str, /* tp_str */
14102 PyObject_GenericGetAttr, /* tp_getattro */
14103 0, /* tp_setattro */
14104 0, /* tp_as_buffer */
14105 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014107 unicode_doc, /* tp_doc */
14108 0, /* tp_traverse */
14109 0, /* tp_clear */
14110 PyUnicode_RichCompare, /* tp_richcompare */
14111 0, /* tp_weaklistoffset */
14112 unicode_iter, /* tp_iter */
14113 0, /* tp_iternext */
14114 unicode_methods, /* tp_methods */
14115 0, /* tp_members */
14116 0, /* tp_getset */
14117 &PyBaseObject_Type, /* tp_base */
14118 0, /* tp_dict */
14119 0, /* tp_descr_get */
14120 0, /* tp_descr_set */
14121 0, /* tp_dictoffset */
14122 0, /* tp_init */
14123 0, /* tp_alloc */
14124 unicode_new, /* tp_new */
14125 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014126};
14127
14128/* Initialize the Unicode implementation */
14129
Victor Stinner3a50e702011-10-18 21:21:00 +020014130int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014131{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014132 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014133 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014134 0x000A, /* LINE FEED */
14135 0x000D, /* CARRIAGE RETURN */
14136 0x001C, /* FILE SEPARATOR */
14137 0x001D, /* GROUP SEPARATOR */
14138 0x001E, /* RECORD SEPARATOR */
14139 0x0085, /* NEXT LINE */
14140 0x2028, /* LINE SEPARATOR */
14141 0x2029, /* PARAGRAPH SEPARATOR */
14142 };
14143
Fred Drakee4315f52000-05-09 19:53:39 +000014144 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014145 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014146 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014147 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014148 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014149
Guido van Rossumcacfc072002-05-24 19:01:59 +000014150 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014151 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014152
14153 /* initialize the linebreak bloom filter */
14154 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014155 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014156 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014157
14158 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020014159
Benjamin Petersonc4311282012-10-30 23:21:10 -040014160 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14161 Py_FatalError("Can't initialize field name iterator type");
14162
14163 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14164 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014165
Victor Stinner3a50e702011-10-18 21:21:00 +020014166#ifdef HAVE_MBCS
14167 winver.dwOSVersionInfoSize = sizeof(winver);
14168 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14169 PyErr_SetFromWindowsErr(0);
14170 return -1;
14171 }
14172#endif
14173 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014174}
14175
14176/* Finalize the Unicode implementation */
14177
/* Always returns 0: this implementation releases nothing.
   NOTE(review): presumably kept so existing callers keep working --
   confirm before removing. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
14183
Guido van Rossumd57fd912000-03-10 22:53:23 +000014184void
Thomas Wouters78890102000-07-22 19:25:51 +000014185_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014187 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014188
Serhiy Storchaka05997252013-01-26 12:14:02 +020014189 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014190
Serhiy Storchaka05997252013-01-26 12:14:02 +020014191 for (i = 0; i < 256; i++)
14192 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014193 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014194 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014195}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014196
Walter Dörwald16807132007-05-25 13:52:07 +000014197void
14198PyUnicode_InternInPlace(PyObject **p)
14199{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014200 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014201 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014202#ifdef Py_DEBUG
14203 assert(s != NULL);
14204 assert(_PyUnicode_CHECK(s));
14205#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014206 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014207 return;
14208#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014209 /* If it's a subclass, we don't really know what putting
14210 it in the interned dict might do. */
14211 if (!PyUnicode_CheckExact(s))
14212 return;
14213 if (PyUnicode_CHECK_INTERNED(s))
14214 return;
14215 if (interned == NULL) {
14216 interned = PyDict_New();
14217 if (interned == NULL) {
14218 PyErr_Clear(); /* Don't leave an exception */
14219 return;
14220 }
14221 }
14222 /* It might be that the GetItem call fails even
14223 though the key is present in the dictionary,
14224 namely when this happens during a stack overflow. */
14225 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010014226 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014227 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014228
Benjamin Peterson29060642009-01-31 22:14:21 +000014229 if (t) {
14230 Py_INCREF(t);
14231 Py_DECREF(*p);
14232 *p = t;
14233 return;
14234 }
Walter Dörwald16807132007-05-25 13:52:07 +000014235
Benjamin Peterson14339b62009-01-31 16:36:08 +000014236 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010014237 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014238 PyErr_Clear();
14239 PyThreadState_GET()->recursion_critical = 0;
14240 return;
14241 }
14242 PyThreadState_GET()->recursion_critical = 0;
14243 /* The two references in interned are not counted by refcnt.
14244 The deallocator will take care of this */
14245 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014246 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014247}
14248
14249void
14250PyUnicode_InternImmortal(PyObject **p)
14251{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014252 PyUnicode_InternInPlace(p);
14253 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020014254 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014255 Py_INCREF(*p);
14256 }
Walter Dörwald16807132007-05-25 13:52:07 +000014257}
14258
14259PyObject *
14260PyUnicode_InternFromString(const char *cp)
14261{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014262 PyObject *s = PyUnicode_FromString(cp);
14263 if (s == NULL)
14264 return NULL;
14265 PyUnicode_InternInPlace(&s);
14266 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014267}
14268
Alexander Belopolsky40018472011-02-26 01:02:56 +000014269void
14270_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014271{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014272 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014273 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014274 Py_ssize_t i, n;
14275 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014276
Benjamin Peterson14339b62009-01-31 16:36:08 +000014277 if (interned == NULL || !PyDict_Check(interned))
14278 return;
14279 keys = PyDict_Keys(interned);
14280 if (keys == NULL || !PyList_Check(keys)) {
14281 PyErr_Clear();
14282 return;
14283 }
Walter Dörwald16807132007-05-25 13:52:07 +000014284
Benjamin Peterson14339b62009-01-31 16:36:08 +000014285 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14286 detector, interned unicode strings are not forcibly deallocated;
14287 rather, we give them their stolen references back, and then clear
14288 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014289
Benjamin Peterson14339b62009-01-31 16:36:08 +000014290 n = PyList_GET_SIZE(keys);
14291 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014292 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014293 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014294 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014295 if (PyUnicode_READY(s) == -1) {
14296 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014297 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014298 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014299 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014300 case SSTATE_NOT_INTERNED:
14301 /* XXX Shouldn't happen */
14302 break;
14303 case SSTATE_INTERNED_IMMORTAL:
14304 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014305 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014306 break;
14307 case SSTATE_INTERNED_MORTAL:
14308 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014309 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014310 break;
14311 default:
14312 Py_FatalError("Inconsistent interned string state.");
14313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014314 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014315 }
14316 fprintf(stderr, "total size of all interned strings: "
14317 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14318 "mortal/immortal\n", mortal_size, immortal_size);
14319 Py_DECREF(keys);
14320 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020014321 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000014322}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014323
14324
14325/********************* Unicode Iterator **************************/
14326
14327typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014328 PyObject_HEAD
14329 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014330 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014331} unicodeiterobject;
14332
14333static void
14334unicodeiter_dealloc(unicodeiterobject *it)
14335{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014336 _PyObject_GC_UNTRACK(it);
14337 Py_XDECREF(it->it_seq);
14338 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014339}
14340
14341static int
14342unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14343{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014344 Py_VISIT(it->it_seq);
14345 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014346}
14347
14348static PyObject *
14349unicodeiter_next(unicodeiterobject *it)
14350{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014351 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014352
Benjamin Peterson14339b62009-01-31 16:36:08 +000014353 assert(it != NULL);
14354 seq = it->it_seq;
14355 if (seq == NULL)
14356 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014357 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014359 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14360 int kind = PyUnicode_KIND(seq);
14361 void *data = PyUnicode_DATA(seq);
14362 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14363 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014364 if (item != NULL)
14365 ++it->it_index;
14366 return item;
14367 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014368
Benjamin Peterson14339b62009-01-31 16:36:08 +000014369 Py_DECREF(seq);
14370 it->it_seq = NULL;
14371 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014372}
14373
14374static PyObject *
14375unicodeiter_len(unicodeiterobject *it)
14376{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014377 Py_ssize_t len = 0;
14378 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014379 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014381}
14382
14383PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14384
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014385static PyObject *
14386unicodeiter_reduce(unicodeiterobject *it)
14387{
14388 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020014389 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014390 it->it_seq, it->it_index);
14391 } else {
14392 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14393 if (u == NULL)
14394 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020014395 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014396 }
14397}
14398
14399PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14400
14401static PyObject *
14402unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14403{
14404 Py_ssize_t index = PyLong_AsSsize_t(state);
14405 if (index == -1 && PyErr_Occurred())
14406 return NULL;
14407 if (index < 0)
14408 index = 0;
14409 it->it_index = index;
14410 Py_RETURN_NONE;
14411}
14412
14413PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14414
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014415static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014416 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000014418 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14419 reduce_doc},
14420 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
14421 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014422 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014423};
14424
14425PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014426 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14427 "str_iterator", /* tp_name */
14428 sizeof(unicodeiterobject), /* tp_basicsize */
14429 0, /* tp_itemsize */
14430 /* methods */
14431 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14432 0, /* tp_print */
14433 0, /* tp_getattr */
14434 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014435 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014436 0, /* tp_repr */
14437 0, /* tp_as_number */
14438 0, /* tp_as_sequence */
14439 0, /* tp_as_mapping */
14440 0, /* tp_hash */
14441 0, /* tp_call */
14442 0, /* tp_str */
14443 PyObject_GenericGetAttr, /* tp_getattro */
14444 0, /* tp_setattro */
14445 0, /* tp_as_buffer */
14446 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14447 0, /* tp_doc */
14448 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14449 0, /* tp_clear */
14450 0, /* tp_richcompare */
14451 0, /* tp_weaklistoffset */
14452 PyObject_SelfIter, /* tp_iter */
14453 (iternextfunc)unicodeiter_next, /* tp_iternext */
14454 unicodeiter_methods, /* tp_methods */
14455 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014456};
14457
14458static PyObject *
14459unicode_iter(PyObject *seq)
14460{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014461 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014462
Benjamin Peterson14339b62009-01-31 16:36:08 +000014463 if (!PyUnicode_Check(seq)) {
14464 PyErr_BadInternalCall();
14465 return NULL;
14466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014467 if (PyUnicode_READY(seq) == -1)
14468 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014469 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14470 if (it == NULL)
14471 return NULL;
14472 it->it_index = 0;
14473 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014474 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014475 _PyObject_GC_TRACK(it);
14476 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014477}
14478
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014479
14480size_t
14481Py_UNICODE_strlen(const Py_UNICODE *u)
14482{
14483 int res = 0;
14484 while(*u++)
14485 res++;
14486 return res;
14487}
14488
14489Py_UNICODE*
14490Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14491{
14492 Py_UNICODE *u = s1;
14493 while ((*u++ = *s2++));
14494 return s1;
14495}
14496
14497Py_UNICODE*
14498Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14499{
14500 Py_UNICODE *u = s1;
14501 while ((*u++ = *s2++))
14502 if (n-- == 0)
14503 break;
14504 return s1;
14505}
14506
14507Py_UNICODE*
14508Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14509{
14510 Py_UNICODE *u1 = s1;
14511 u1 += Py_UNICODE_strlen(u1);
14512 Py_UNICODE_strcpy(u1, s2);
14513 return s1;
14514}
14515
14516int
14517Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14518{
14519 while (*s1 && *s2 && *s1 == *s2)
14520 s1++, s2++;
14521 if (*s1 && *s2)
14522 return (*s1 < *s2) ? -1 : +1;
14523 if (*s1)
14524 return 1;
14525 if (*s2)
14526 return -1;
14527 return 0;
14528}
14529
14530int
14531Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14532{
14533 register Py_UNICODE u1, u2;
14534 for (; n != 0; n--) {
14535 u1 = *s1;
14536 u2 = *s2;
14537 if (u1 != u2)
14538 return (u1 < u2) ? -1 : +1;
14539 if (u1 == '\0')
14540 return 0;
14541 s1++;
14542 s2++;
14543 }
14544 return 0;
14545}
14546
14547Py_UNICODE*
14548Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14549{
14550 const Py_UNICODE *p;
14551 for (p = s; *p; p++)
14552 if (*p == c)
14553 return (Py_UNICODE*)p;
14554 return NULL;
14555}
14556
14557Py_UNICODE*
14558Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14559{
14560 const Py_UNICODE *p;
14561 p = s + Py_UNICODE_strlen(s);
14562 while (p != s) {
14563 p--;
14564 if (*p == c)
14565 return (Py_UNICODE*)p;
14566 }
14567 return NULL;
14568}
Victor Stinner331ea922010-08-10 16:37:20 +000014569
Victor Stinner71133ff2010-09-01 23:43:53 +000014570Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014571PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014572{
Victor Stinner577db2c2011-10-11 22:12:48 +020014573 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014574 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014576 if (!PyUnicode_Check(unicode)) {
14577 PyErr_BadArgument();
14578 return NULL;
14579 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014580 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014581 if (u == NULL)
14582 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014583 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014584 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014585 PyErr_NoMemory();
14586 return NULL;
14587 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014588 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014589 size *= sizeof(Py_UNICODE);
14590 copy = PyMem_Malloc(size);
14591 if (copy == NULL) {
14592 PyErr_NoMemory();
14593 return NULL;
14594 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014595 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014596 return copy;
14597}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014598
Georg Brandl66c221e2010-10-14 07:04:07 +000014599/* A _string module, to export formatter_parser and formatter_field_name_split
14600 to the string.Formatter class implemented in Python. */
14601
14602static PyMethodDef _string_methods[] = {
14603 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14604 METH_O, PyDoc_STR("split the argument as a field name")},
14605 {"formatter_parser", (PyCFunction) formatter_parser,
14606 METH_O, PyDoc_STR("parse the argument as a format string")},
14607 {NULL, NULL}
14608};
14609
14610static struct PyModuleDef _string_module = {
14611 PyModuleDef_HEAD_INIT,
14612 "_string",
14613 PyDoc_STR("string helper module"),
14614 0,
14615 _string_methods,
14616 NULL,
14617 NULL,
14618 NULL,
14619 NULL
14620};
14621
14622PyMODINIT_FUNC
14623PyInit__string(void)
14624{
14625 return PyModule_Create(&_string_module);
14626}
14627
14628
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014629#ifdef __cplusplus
14630}
14631#endif